Polysubstance use and treatment completion

Analyze the association between polysubstance use at admission and treatment completion longitudinally across treatments, accounting for irregular observations.

Author

Andrés González Santa Cruz

Published

September 25, 2025


Data Loading and Exploration

Loading Packages and uniting databases

Proceed to load the necessary packages.

Code
# invisible("Only run from Ubuntu")
# Commented-out fallback: on a non-server, non-Windows session, pin
# reticulate to a hard-coded pyenv interpreter.
# if (!(Sys.getenv("RSTUDIO_SESSION_TYPE") == "server" || file.exists("/.dockerenv"))) {
#   if(Sys.info()["sysname"]!="Windows"){
#     Sys.setenv(RETICULATE_PYTHON = "/home/fondecytacc/.pyenv/versions/3.11.5/bin/python")
#   }
# }

# Clean environment. NOTE(review): rm(list = ls()) wipes the whole session;
# acceptable at the top of a rendered document, but avoid in sourced code.
rm(list = ls()); gc()

# Timestamp taken before processing, for elapsed-time reporting later.
time_before_dedup2<-Sys.time()

#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
# --- Bootstrap reticulate with a path relative to getwd() ---
suppressPackageStartupMessages(library(reticulate))

# Search for .mamba_root/envs/py311/python.exe from getwd() upward through parents
# Walk upward from `start` toward the filesystem root looking for a Python
# interpreter at the relative path `rel`.
#
# start : directory to begin the search from (default: getwd()).
# rel   : relative path to the interpreter inside an ancestor directory.
#         Generalized: the original hard-coded "python.exe" (Windows only);
#         on POSIX systems conda/mamba environments place the binary at
#         bin/python, so the default now adapts to the platform while
#         remaining unchanged on Windows.
#
# Returns the normalized absolute path of the first match found, or
# NA_character_ when no ancestor directory contains `rel`.
find_python_rel <- function(start = getwd(),
                            rel = file.path(
                              ".mamba_root", "envs", "py311",
                              if (.Platform$OS.type == "windows") "python.exe"
                              else file.path("bin", "python")
                            )) {
  cur <- normalizePath(start, winslash = "/", mustWork = FALSE)
  repeat {
    cand <- normalizePath(file.path(cur, rel), winslash = "/", mustWork = FALSE)
    if (file.exists(cand)) {
      return(cand)
    }
    parent <- dirname(cur)
    if (identical(parent, cur)) {
      return(NA_character_)  # reached the filesystem root without a hit
    }
    cur <- parent
  }
}

# Locate the project-relative Python interpreter (NA_character_ if absent).
py <- find_python_rel()

if (is.na(py)) {
  stop("No se encontró Python relativo a getwd() (buscando '.mamba_root/envs/py311/python.exe').\n",
       "Directorio actual: ", getwd())
}

# Force reticulate to use exactly this interpreter: clear any conda-env
# overrides first, then pin RETICULATE_PYTHON before reticulate initializes.
Sys.unsetenv(c("RETICULATE_CONDAENV","RETICULATE_PYTHON_FALLBACK"))
Sys.setenv(RETICULATE_PYTHON = py)
reticulate::use_python(py, required=T)

py_config()  # print the active Python configuration as a sanity check

#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_

# Project root path: append "/cons" to getwd() and then strip every "cons",
# which normalizes the path whether or not the session started inside a
# "cons" subfolder. The result keeps a trailing "/", so it can be
# concatenated directly with relative paths below.
wdpath<-
paste0(gsub("/cons","",gsub("cons","",paste0(getwd(),"/cons"))))
# Loads (among others) SISTRAT23_c1_2010_2024_df_prev1t, used in the join below.
try(load(paste0(wdpath,"data/20241015_out/psu/","SISTRAT_database_sep25_prevt.RData")))

# Wide per-treatment panel: one row per treatment episode.
SISTRAT23_c1_2010_2024_df_prev1w<- 
try(readRDS(paste0(wdpath,"data/20241015_out/psu/","SISTRAT23_c1_2010_2024_df_prev1w.rds")))

# NOTE(review): loaded here but not referenced in this document's visible
# chunks — presumably consumed downstream; confirm before removing.
df23_ndp_20250824_SISTRAT23_c1_1024<- 
 readRDS(paste0(wdpath,"data/20241015_out/psu/","23_ndp_20250824_SISTRAT23_c1_1024_df2.rds"))
          used (Mb) gc trigger (Mb) max used (Mb)
Ncells  601868 32.2    1294624 69.2   966921 51.7
Vcells 1157544  8.9    8388608 64.0  1876154 14.4
python:         G:/My Drive/Alvacast/SISTRAT 2023/.mamba_root/envs/py311/python.exe
libpython:      G:/My Drive/Alvacast/SISTRAT 2023/.mamba_root/envs/py311/python311.dll
pythonhome:     G:/My Drive/Alvacast/SISTRAT 2023/.mamba_root/envs/py311
version:        3.11.5 | packaged by conda-forge | (main, Aug 27 2023, 03:23:48) [MSC v.1936 64 bit (AMD64)]
Architecture:   64bit
numpy:           [NOT FOUND]

NOTE: Python version was forced by RETICULATE_PYTHON
Code
#https://github.com/rstudio/renv/issues/544
#renv falls back to copying rather than symlinking, which is evidently very slow in this configuration.
renv::settings$use.cache(FALSE)

#only use explicit dependencies (in DESCRIPTION)
renv::settings$snapshot.type("implicit")

#check if rstools is installed
try(installr::install.Rtools(check_r_update=F))

Installing package into ‘G:/My Drive/Alvacast/SISTRAT 2023/renv/library/windows/R-4.4/x86_64-w64-mingw32’ (as ‘lib’ is unspecified)

Code
# Verify that the installed Quarto CLI satisfies a version constraint.
#
# required   : version string to compare against (default "1.7.29").
# comparator : one of "ge", "gt", "le", "lt", "eq" (>=, >, <=, <, ==).
#
# Stops with an informative error when the constraint fails; otherwise
# returns TRUE invisibly.
check_quarto_version <- function(required = "1.7.29", comparator = c("ge","gt","le","lt","eq")) {
  comparator <- match.arg(comparator)
  installed <- package_version(paste(unlist(quarto::quarto_version()), collapse = "."))
  needed    <- package_version(required)

  passed <- switch(
    comparator,
    ge = installed >= needed,
    gt = installed >  needed,
    le = installed <= needed,
    lt = installed <  needed,
    eq = installed == needed
  )

  if (!passed) {
    stop(sprintf("Quarto version check failed: need %s %s (installed: %s).",
                 comparator, required, installed), call. = FALSE)
  }
  invisible(TRUE)
}

check_quarto_version("1.7.29", "ge") 

#change repository to CL
local({
  r <- getOption("repos")
  r["CRAN"] <- "https://cran.dcc.uchile.cl/"
  options(repos=r)
})

if(!require(pacman)){install.packages("pacman");require(pacman)}

Cargando paquete requerido: pacman

Code
if(!require(pak)){install.packages("pak");require(pak)}

Cargando paquete requerido: pak

Code
pacman::p_unlock(lib.loc = .libPaths()) #para no tener problemas reinstalando paquetes

No 00LOCK detected in: G:/My Drive/Alvacast/SISTRAT 2023/renv/library/windows/R-4.4/x86_64-w64-mingw32 No 00LOCK detected in: C:/Program Files/R/R-4.4.1/library

Code
# Pin the R version on Windows so the renv library path (R-4.4) stays valid.
if(Sys.info()["sysname"]=="Windows"){
if (getRversion() != "4.4.1") { stop("Requires R version 4.4.1; Actual: ", getRversion()) }
}

# --- Docker availability check --------------------------------------------
# Returns TRUE when the Docker daemon answers `docker info`, FALSE otherwise.
#
# FIX: the previous version used system("docker info", intern = TRUE), which
# signals a *warning* (not an error) when the command exits non-zero, so
# try(..., silent = TRUE) never caught a stopped daemon and the script always
# reported "Docker is running." (visible in the rendered output above).
check_docker_running <- function() {
  status <- suppressWarnings(
    system("docker info", ignore.stdout = TRUE, ignore.stderr = TRUE)
  )
  status == 0
}

# Open the Docker Desktop download page so the user can install it manually.
install_docker <- function() {
  browseURL("https://www.docker.com/products/docker-desktop")
}

# Main logic.
# FIX: the original defined install_docker() but then called
# liftr::install_docker(); liftr is never loaded or listed in `paks`, so the
# fallback branch would itself error. Use the local helper instead.
if (isTRUE(try(check_docker_running(), silent = TRUE))) {
  message("Docker is running.")
} else {
  install_docker()
}

Warning in system(“docker info”, intern = TRUE, ignore.stderr = TRUE): el comando ejecutado ‘docker info’ tiene el estatus 1

Docker is running.

Code
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#PACKAGES#######################################################################
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_

unlink("*_cache", recursive=T)

# ----------------------------------------------------------------------
# 2. Use a single pak::pkg_install() call for most CRAN packages
# ----------------------------------------------------------------------

paks <-
  c(#"git", 
    # To connect to github
    "gh", #interface for  GitHub API from R
    #
    "gitcreds", # manages Git credentials (usernames, passwords, tokens)
    #
    "usethis", # simplifies common project setup tasks for R developers
    # Package to bring packages in development
    "devtools",
    # Package administration
    "renv",
    # To manipulate data
    "knitr", "pander", "DT",
    # Join
    "fuzzyjoin", "RecordLinkage",
    # For tables
    "tidyverse", "janitor",
    # For contingency tables
    "kableExtra",
    # For connections with python
    "reticulate",
    # To manipulate big data
    "polars", "sqldf",
    # To bring big databases
    "nanoparquet",
    # Interface for R and RStudio in R
    "installr", "rmarkdown", "quarto", "yaml", #"rstudioapi",
    # Time handling
    "clock",
    # Combine plots
    "ggpubr",
    # Parallelized iterative processing
    "furrr",
    # Work like a tibble with a data.table database
    "tidytable",
    # Split database into training and testing
    "caret",
    # Impute missing data
    "missRanger", "mice",
    # To modularize tasks
    "job",
    # For PhantomJS install checks
    "webshot"
  )

# dplyr
# janitor
# reshape2
# tidytable
# arrow
# boot
# broom
# car
# caret
# data.table
# DiagrammeR
# DiagrammeRsvg
# dplyr
# epiR
# epitools
# ggplot2
# glue
# htmlwidgets
# knitr
# lubridate
# naniar
# parallel
# polycor
# pROC
# psych
# readr
# rio
# rsvg
# scales
# stringr
# tableone
# rmarkdown
# biostat3
# codebook
# finalfit
# Hmisc
# kableExtra
# knitr
# devtools
# tidyr
# stringi
# stringr
# muhaz
# sqldf
# compareGroups
# survminer
# lubridate
# ggfortify
# car
# fuzzyjoin
# compareGroups
# caret
# job
# htmltools
# nanoparquet
# ggpubr
# polars
# installr
# clock
# pander
# reshape
# mice
# missRanger
# VIM
# withr
# biostat3
# broom
# glue
# finalfit
# purrr
# sf


# pak::pkg_install(paks)

pak::pak_sitrep()
# pak::sysreqs_check_installed(unique(unlist(paks)))
#pak::lockfile_create(unique(unlist(paks)),  "dependencies_duplicates24.lock", dependencies=T)
#pak::lockfile_install("dependencies_duplicates24.lock")
#https://rdrr.io/cran/pak/man/faq.html
#pak::cache_delete()

library(tidytable)

Adjuntando el paquete: ‘tidytable’

The following objects are masked from ‘package:stats’:

dt, filter, lag

The following object is masked from ‘package:base’:

%in%
Code
library(ggplot2)
library(readr)
library(survival)

cat("Install IrregLong\n")
if (!requireNamespace("IrregLong", quietly = TRUE)) {
  install.packages("IrregLong")
}
cat("Install geepack\n")
if (!requireNamespace("geepack", quietly = TRUE)) {
  install.packages("geepack")
}
# ----------------------------------------------------------------------
# 3. Activate polars code completion (safe to try even if it fails)
# ----------------------------------------------------------------------
#try(polars_code_completion_activate())

# ----------------------------------------------------------------------
# 4. BPMN from GitHub (not on CRAN, so install via devtools if missing)
# ----------------------------------------------------------------------
if (!requireNamespace("bpmn", quietly = TRUE)) {
  devtools::install_github("bergant/bpmn")
}


# ----------------------------------------------------------------------
# 5. PhantomJS Check (use webshot if PhantomJS is missing)
# ----------------------------------------------------------------------
# if (!webshot::is_phantomjs_installed()) {
#   webshot::install_phantomjs()
# }


# Memory management function
# Trigger garbage collection.
#
# FIX: the original additionally tried to grow the Windows memory limit via
# memory.limit(size = memory.limit() + 1000); memory.limit() is defunct as of
# R 4.2 and only produced "'memory.limit()' is no longer supported" warnings
# (visible in the rendered output), so that branch has been removed.
#
# Returns the gc() summary matrix, invisibly.
manage_memory <- function() {
    invisible(gc())
}

manage_memory()

Warning: ‘memory.limit()’ is no longer supported

Code
# Report current memory usage and flag memory pressure.
#
# threshold : fraction of the limit above which TRUE is returned (default
#             0.8, matching the original hard-coded value — backward
#             compatible).
#
# Returns TRUE when usage exceeds the threshold, FALSE otherwise; always
# FALSE on non-Windows platforms and whenever usage cannot be measured.
#
# Fixes two defects in the original:
#  * it called log_message(), which is not defined anywhere in this file;
#  * memory.size()/memory.limit() are defunct since R 4.2 and return Inf
#    with a warning, making the ratio NaN and the comparison NA.
check_memory <- function(threshold = 0.8) {
    if (.Platform$OS.type == "windows") {
        usage <- tryCatch(
            suppressWarnings(c(memory.size(), memory.limit())),
            error = function(e) c(NA_real_, NA_real_)
        )
        if (all(is.finite(usage)) && usage[2] > 0) {
            message(sprintf("Memory usage: %.1f MB / %s MB",
                            usage[1], usage[2]))
            return(usage[1] / usage[2] > threshold)
        }
        return(FALSE)
    }
    FALSE
}

#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#FUNCTIONS######################################################################
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_

#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#
# NO MORE DEBUGS
options(error = NULL)        # reset in case options(error = recover) or browser() was set earlier
options(browserNLdisabled = FALSE)


#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#
#NAs are replaced with "" in knitr kable
options(knitr.kable.NA = '')

# pander number formatting: "," thousands separator, "." decimal point.
pander::panderOptions('big.mark', ',')
pander::panderOptions('decimal.mark', '.')

#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#

#to format rows in bold
# Wrap selected cells of a data.frame in pandoc emphasis markup
# (for bold/italic/strikethrough rows in knitr/pander tables).
#
# df    : data.frame whose cells will be formatted (factors are coerced to
#         character).
# rows  : row indices to format.
# cols  : column names or indices to format.
# value : which markup to apply; one of "italics", "bold", "strikethrough".
#
# FIX: the original omitted match.arg(), so calling format_cells() without
# `value` used the whole choices vector as markup and recycled
# "*"/"**"/"~~" across cells. Empty cells stay empty; spaces inside
# formatted cells are removed (original behavior, kept).
format_cells <- function(df, rows, cols, value = c("italics", "bold", "strikethrough")){

  value <- match.arg(value)

  # select the correct markup:
  # one * for italics, two ** for bold, ~~ for strikethrough
  map <- setNames(c("*", "**", "~~"), c("italics", "bold", "strikethrough"))
  markup <- map[value]

  for (r in rows){
    for(c in cols){

      # Make sure values are not factors
      df[[c]] <- as.character( df[[c]])

      # Update formatting
      df[r, c] <- ifelse(nchar(df[r, c])==0,"",paste0(markup, gsub(" ", "", df[r, c]), markup))
    }
  }

  return(df)
}
# knitr output hooks: render errors, warnings, and messages as Bootstrap
# alert <div>s in the HTML output (danger/warning/plain message styles).
# The nested gsub('##', ...) calls strip knitr's "## " output prefix and
# turn the leading "Error"/"Warning:" token into bold markdown.
knitr::knit_hooks$set(
   error = function(x, options) {
     paste('\n\n<div class="alert alert-danger" style="font-size: small !important;">',
           gsub('##', '\n', gsub('^##\ Error', '**Error**', x)),
           '</div>', sep = '\n')
   },
   warning = function(x, options) {
     paste('\n\n<div class="alert alert-warning" style="font-size: small !important;">',
           gsub('##', '\n', gsub('^##\ Warning:', '**Warning**', x)),
           '</div>', sep = '\n')
   },
   message = function(x, options) {
     paste('<div class="message" style="font-size: small !important;">',
           gsub('##', '\n', x),
           '</div>', sep = '\n')
   }
)

#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#

# Summarize the distribution of a date vector.
#
# x : anything coercible with as.Date().
#
# Returns a one-row data.frame with the minimum, a fixed ladder of
# quantiles (0.1%, 0.5%, 2.5%, 25%, 50%, 75%, 97.5%, 99.5%, 99.9%) and the
# maximum, each converted back to Date (origin 1970-01-01). NAs are
# dropped.
#
# The original repeated the as.Date(quantile(...)) incantation eleven
# times; this version computes the quantile ladder in one call — same
# probabilities, same default quantile type, identical results — and uses
# TRUE instead of the discouraged T.
sum_dates <- function(x){
  d <- unclass(as.Date(x))
  probs  <- c(.001, .005, .025, .25, .5, .75, .975, .995, .999)
  labels <- c("p001", "p005", "p025", "p25", "p50", "p75", "p975", "p995", "p999")
  qs <- as.Date(quantile(d, probs = probs, na.rm = TRUE), origin = "1970-01-01")
  cols <- c(
    list(min = as.Date(min(d, na.rm = TRUE), origin = "1970-01-01")),
    setNames(as.list(qs), labels),
    list(max = as.Date(max(d, na.rm = TRUE), origin = "1970-01-01"))
  )
  do.call(cbind.data.frame, cols)
}

#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:

# Define the function adapted for Polars.
# Summarize a date column of a polars DataFrame: min, max, and the same
# quantile ladder used by sum_dates(), built as lazy pl$col expressions.
# Requires the polars `pl` namespace to be available in the session.
#
# NOTE(review): column labels are built as paste0("p", sub("\\.", "", q)),
# e.g. 0.001 -> "p0001" and 0.25 -> "p025" — which does NOT match the
# sum_dates() naming ("p001", "p25", ...); confirm downstream code expects
# this scheme before relying on both outputs interchangeably.
sum_dates_polars <- function(df, date_col) {
  # Create the list of quantiles
  quantiles <- c(0.001, 0.005, 0.025, 0.25, 0.5, 0.75, 0.975, 0.995, 0.999)
  # Create expressions to calculate min and max
  expr_list <- list(
    pl$col(date_col)$min()$alias("min"),
    pl$col(date_col)$max()$alias("max")
  )
  # Add expressions for quantiles
  for (q in quantiles) {
    expr_list <- append(expr_list, pl$col(date_col)$quantile(q)$alias(paste0("p", sub("\\.", "", as.character(q)))))
  }
  # Apply the expressions and return a DataFrame with the results
  df$select(expr_list)
}

# Draw `size` rows from `data` reproducibly by fixing the RNG seed before
# delegating to dplyr::sample_n().
# NOTE(review): set.seed() mutates the session-wide RNG state as a side
# effect; any subsequent code relying on the previous random stream will be
# affected.
sample_n_with_seed <- function(data, size, seed) {
  set.seed(seed)
  dplyr::sample_n(data, size)
}

# Return the single most frequent value in `x` (its mode), or NA when the
# maximum frequency is tied between several values, or when no non-missing
# values remain.
#
# x     : atomic vector.
# na.rm : drop NAs before counting (default TRUE).
#
# Fixes two defects in the original:
#  * any NA in `x` made every count NA (x == val propagates NA through
#    sum()), forcing the result to NA regardless of the data;
#  * empty input raised an error from max() on an empty vector.
# Also replaces sapply() with the type-stable vapply().
most_frequent <- function(x, na.rm = TRUE) {
  if (na.rm) {
    x <- x[!is.na(x)]
  }
  if (length(x) == 0L) {
    return(NA)
  }
  uniq_vals <- unique(x)
  freq_vals <- vapply(uniq_vals, function(val) sum(x == val), integer(1))
  most_freq <- uniq_vals[freq_vals == max(freq_vals)]

  if (length(most_freq) == 1L) {
    return(most_freq)
  }
  NA
}
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#CONFIG #######################################################################
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_

options(scipen=2) #display numbers rather scientific number

#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_

# Join the elements of `x` with "; " and, when `width` is supplied and
# positive, truncate the joined string to at most `width` display
# characters, appending "..." (the effective width is never below 6).
# A `width` of 0 or NULL disables truncation; negative widths are an error.
toString2 <- function(x, width = NULL, ...) {
    joined <- paste(x, collapse = "; ")
    if (missing(width) || is.null(width) || width == 0) {
        return(joined)
    }
    if (width < 0) {
        stop("'width' must be positive")
    }
    if (nchar(joined, type = "w") > width) {
        cutoff <- max(6, width)
        joined <- paste0(substr(joined, 1, cutoff - 3), "...")
    }
    joined
}
# Normalize free text for matching: transliterate Latin characters to plain
# ASCII, lower-case, and strip surrounding whitespace.
normalize_txt <- function(x) {
  trimws(tolower(stringi::stri_trans_general(x, "Latin-ASCII")))
}

# Map free-text (Spanish) psychiatric diagnosis strings onto broad ICD-10
# chapters (F0 Organic ... F8/9 Neurodevelopment). Matching is by keyword
# regex on the lower-cased input; case_when() takes the FIRST matching rule,
# so the order of the branches matters. Anything unmatched falls through to
# "Other/unspecified".
icd10_broad <- function(x) {
x <- tolower(x)
tidytable::case_when(
stringr::str_detect(x, "organic|organico|demenc|alzheimer|parkinson|delirium|cerebral") ~ "F0 Organic",
stringr::str_detect(x, "psicoactiva|alcohol|marihuana|canabis|cannabis|cocain|opio|opiace|benzodiazep|sustancias") ~ "F1 Substance use",
stringr::str_detect(x, "esquizofren|psicotip|delirant|psicosis") ~ "F2 Psychotic",
stringr::str_detect(x, "estado de animo|afectiv|depres|bipolar|maniaco|distimia|hipoman") ~ "F3 Mood",
stringr::str_detect(x, "neurotic|ansiedad|fobi|panico|obsesivo|compulsiv|estres|adaptaci|somatoform|somatiz") ~ "F4 Anxiety/Stress/Somatoform",
stringr::str_detect(x, "comportamiento.*fisiolog|alimentari|anorex|bulim|sueñ|insomni|disfuncion sexual") ~ "F5 Physio/Eating/Sleep/Sexual",
stringr::str_detect(x, stringr::regex("personalidad|comportamiento del adulto|antisocial|limite|evitativ|habit|habitos|impuls|control de los impulsos|control\\s+de\\s+impulsos", ignore_case = TRUE)) ~ "F6 Personality/Adult behaviour",
stringr::str_detect(x, "retraso mental|discapacidad intelectual|intelectual") ~ "F7 Intellectual disability",
stringr::str_detect(x, "desarrollo psicolog|autism|asperger|lenguaje|aprendizaje|espectro autista|tdah|t\\s*d\\s*a\\s*h") ~ "F8/9 Neurodevelopment/Child",
TRUE ~ "Other/unspecified"
)
}
# Map free-text (Spanish) psychiatric diagnosis strings onto broad DSM-IV
# groupings. First-match-wins ordering, same approach as icd10_broad().
# NOTE(review): the pattern "para[f|f]ilia" uses a character class where "|"
# is a literal and the "f" is duplicated — it effectively matches only
# "parafilia"; presumably intended, but confirm (e.g. vs "para[f]ilia").
dsmiv_broad <- function(x) {
x <- tolower(x)
tidytable::case_when(
stringr::str_detect(x, "sustancia|alcohol|marihuana|cannabis|cocain|opio|opiace|benzodiazep") ~ "Substance-related",
stringr::str_detect(x, "esquizofren|psicosis|psicot") ~ "Psychotic",
stringr::str_detect(x, "estado de animo|afectiv|depres|bipolar|maniaco|distimia|hipoman") ~ "Mood",
stringr::str_detect(x, "ansiedad|fobi|panico|obsesivo|compulsiv|estres|adaptaci") ~ "Anxiety/Stress/Adjustment",
stringr::str_detect(x, "somatoform|somatiz|disociativ|conversion") ~ "Somatoform/Dissociative",
stringr::str_detect(x, "alimentari|anorex|bulim|sueñ|insomni|disfuncion sexual|para[f|f]ilia|sexual") ~ "Eating/Sleep/Sexual",
stringr::str_detect(x, "personalidad|antisocial|limite|borderline|paranoide|evitativ|dependient|narcis") ~ "Personality",
stringr::str_detect(x, "retraso mental|discapacidad intelectual|intelectual") ~ "Intellectual disability",
stringr::str_detect(x, "desarrollo|autism|asperger|infancia|tdah|t\\s*d\\s*a\\s*h|lenguaje|aprendizaje") ~ "Neurodevelopment/Childhood-onset",
TRUE ~ "Other/unspecified"
)
}

# Combine a main and a sub code into a single "main::sub" string.
#
# A missing half is printed as the literal "NA"; when BOTH halves are
# missing the result is a true NA_character_, so fully-missing pairs stay
# missing.
#
# Simplification: base ifelse() replaces tidytable::coalesce(); the output
# is identical and the helper no longer needs tidytable to be installed.
pair_str <- function(main, sub) {
  main <- as.character(main)
  sub  <- as.character(sub)
  both_na <- is.na(main) & is.na(sub)

  out <- paste0(
    ifelse(is.na(main), "NA", main),
    "::",
    ifelse(is.na(sub),  "NA", sub)
  )

  out[both_na] <- NA_character_
  out
}

#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
# IIW weight functions

# Effective sample size of a weight vector: n / (1 + CV^2), where CV is the
# coefficient of variation of the non-missing weights. Equal weights give
# ESS = n; highly variable weights shrink it.
ess <- function(w) {
    valid <- w[!is.na(w)]
    cv_sq <- var(valid) / mean(valid)^2
    length(valid) / (1 + cv_sq)
}
# One-row tibble of diagnostics for a weight vector `w`: counts of
# (non-)missing values, distribution summary, the share of extreme weights
# (< 0.2 and > 5), and the effective sample size via ess().
#
# name   : kept for interface compatibility (the weight_set column is
#          currently commented out).
# lo, hi : probabilities for the tail quantiles reported in q01/q99.
#          FIX: the original accepted lo/hi but ignored them, always using
#          hard-coded 0.01/0.99; they are now honored (the defaults
#          preserve the previous output exactly).
summ_w <- function(w, name, lo = 0.01, hi = 0.99) {
    tibble::tibble(
        #weight_set = name,
        n_non_na   = sum(!is.na(w)),
        n_na       = sum(is.na(w)),
        min        = min(w, na.rm=TRUE),
        q01        = stats::quantile(w, probs=lo, na.rm=TRUE),
        q25        = stats::quantile(w, probs=0.25, na.rm=TRUE),
        median     = stats::median(w, na.rm=TRUE),
        mean       = mean(w, na.rm=TRUE),
        q75        = stats::quantile(w, probs=0.75, na.rm=TRUE),
        q99        = stats::quantile(w, probs=hi, na.rm=TRUE),
        max        = max(w, na.rm=TRUE),
        prop_lt_0.2= mean(w < 0.2, na.rm=TRUE),
        prop_gt_5  = mean(w > 5,   na.rm=TRUE),
        ESS        = ess(w)
    )
}
# Winsorize weights for sensitivity analyses: clamp values outside the
# quantiles given by `p` (default: 1st and 99th percentiles) to those
# bounds. NAs are ignored when computing the bounds.
cap <- function(w, p = c(0.01, 0.99)) {
  bounds <- stats::quantile(w, probs = p, na.rm = TRUE)
  pmin(pmax(w, bounds[1]), bounds[2])
}
# Rescale weights so they average to 1 (NAs ignored in the mean).
renorm <- function(w) {
  w / mean(w, na.rm = TRUE)
}


#_#_#_#_#_#_#_#_#_#_#_#_#_
#_#_#_#_#_#_#_#_#_#_#_#_#_

# Tidy the Cox model stored in object$m into a display-ready table:
# exponentiated estimates (hazard ratios) with confidence intervals,
# numeric columns formatted to 2 decimals and p-values to 4 decimals (all
# returned as character). Requires broom, dplyr, and an attached magrittr
# pipe.
counting_process_output<-
  function(object=NULL){
    broom::tidy(object$m, exponentiate=T, conf.int=T) %>% 
      dplyr::mutate(across(c("estimate","std.error", "statistic","conf.low","conf.high"),~sprintf("%1.2f",.))) %>% 
      dplyr::mutate(p.value= sprintf("%1.4f",p.value))
  }

#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:

# Turn a fitted Cox model (anything exposing $coefficients and a robust
# variance matrix in $var) into a tidy data frame of hazard ratios with
# Wald confidence intervals and two-sided p-values based on the robust
# standard errors.
#
# model      : fitted model object (e.g. from survival::coxph with robust
#              variance).
# conf.level : confidence level for the intervals (default 0.95).
tidy_coxph <- function(model, conf.level = 0.95) {
  beta <- model$coefficients
  se   <- sqrt(diag(model$var))  # robust SEs from the sandwich variance

  # Wald z statistics and two-sided p-values
  z_stat  <- beta / se
  p_value <- 2 * (1 - pnorm(abs(z_stat)))

  # Critical value for the requested confidence level
  crit <- qnorm(1 - (1 - conf.level) / 2)

  data.frame(
    term         = names(beta),
    coef         = beta,
    robust_se    = se,
    hazard_ratio = exp(beta),
    lower_ci     = exp(beta - crit * se),
    upper_ci     = exp(beta + crit * se),
    z            = z_stat,
    p_value      = p_value,
    stringsAsFactors = FALSE
  )
}

# Add an `hr_ci` column like "1.23 (0.90, 1.70)" to a tidy_coxph()-style
# table, rounding the hazard ratio and its CI bounds to `digits` decimals.
format_hr_ci <- function(tidy_table, digits = 2) {
  # Build the sprintf spec once, e.g. "%.2f (%.2f, %.2f)" for digits = 2.
  spec <- sprintf("%%.%df (%%.%df, %%.%df)", digits, digits, digits)

  tidy_table$hr_ci <- sprintf(
    spec,
    tidy_table$hazard_ratio,
    tidy_table$lower_ci,
    tidy_table$upper_ci
  )

  tidy_table
}

#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
Error in contrib.url(repos, "source") : 
  trying to use CRAN without setting a mirror
* pak version:
- 0.8.0.1
* Version information:
- pak platform: x86_64-w64-mingw32 (current: x86_64-w64-mingw32, compatible)
- pak repository: - (local install?)
* Optional packages installed:
- pillar
* Library path:
- G:/My Drive/Alvacast/SISTRAT 2023/renv/library/windows/R-4.4/x86_64-w64-mingw32
- C:/Program Files/R/R-4.4.1/library
* pak is installed at G:/My Drive/Alvacast/SISTRAT 2023/renv/library/windows/R-4.4/x86_64-w64-mingw32/pak.
* Dependency versions:
- callr      3.7.6      
- cli        3.6.2      
- curl       5.2.1      
- desc       1.4.3      
- filelock   1.0.3      
- jsonlite   1.8.8      
- lpSolve    5.6.23.9000
- pkgbuild   1.4.4      
- pkgcache   2.2.2.9000 
- pkgdepends 0.7.2.9000 
- pkgsearch  3.1.3.9000 
- processx   3.8.4      
- ps         1.7.6      
- R6         2.5.1      
- zip        2.3.1      
* Dependencies can be loaded
Install IrregLong
Install geepack
[1] Inf


Database consolidation

We used SISTRAT23_c1_2010_2024_df_prev1w instead of v16, because we wanted to capture more treatments from each person.

Code
invisible("Label variables")
# main=========
# =========================
# Core variables
# =========================
# Biopsychosocial compromise = biopsych_comp / severe = biopsych_comp_severe #created
# Age at Admission to Treatment (First Entry) = adm_age_rec3
# Birth year = birth_date_rec / year_decimal  #created
# Primary substance of the initial diagnosis = sus_ini_mod_mvv
# Psychiatric comorbidity (ICD-10) = dg_psiq_cie_10_dg
# Psychiatric comorbidity (ICD-10) [In study] = dg_psiq_cie_10_instudy
# Broad diagnostic categories (e.g., Anxiety, Personality) = cie10_broad_cat
# Frequency of primary substance use at admission = prim_sub_freq
# Daily = created= prim_sub_daily
# Occupational status (corrected) = occupation_condition_corr24   # created
# Primary substance at admission to treatment = primary_sub
# Polysubstance = polysubstance_strict   # created

# =========================
# Survival variables
# =========================
# Discharge date (numeric, rec6) = disch_date_num_rec6
# Discharge date (numeric, rec6, imputed/translated) = disch_date_num_rec6_trans
# Admission date of next treatment = adm_date_next_treat
# Admission date (numeric, rec2) = adm_date_num_rec2
# Treatment compliance (rec7) = tr_compliance_rec7   # created

# =========================
# Effect modifier
# =========================
# Plan type= plan_type_mod # created

# =========================
# Process variables
# =========================
# Treatment outcome of the previous treatment= tr_completion # created
# Previous biopsychosocial compromise (severe)
# Previous treatment duration (<90 days)
# Previous treatment duration (log days)
# Polysubstance use status of the previous treatment
# Age at admission to treatment (previous)
# Birth year (previous)
# Primary substance (initial diagnosis): alcohol = primary_sub
# Primary substance (initial diagnosis): cocaine
# Primary substance (initial diagnosis): cocaine base paste
# Primary substance (initial diagnosis): marijuana
# Psychiatric comorbidity (diagnosis unknown / under study)
# Psychiatric comorbidity (confirmed comorbidity)
# Daily frequency of primary substance at admission
# Occupational status (inactive) = occupation_condition_corr   # created
# Occupational status (unemployed) = occupation_condition_corr   # created
# Primary substance at admission to treatment (alcohol)
# Primary substance at admission to treatment (cocaine hydrochloride)
# Primary substance at admission to treatment (cocaine base paste)
# Primary substance at admission to treatment (marijuana)

#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
# adapt database=========
# Secondary-substance columns used for the polysubstance definition.
sec_cols <- c("second_sub1", "second_sub2", "second_sub3")

# NOTE(review): SISTRAT23_c1_2010_2024_df_prev1t is expected to come from the
# .RData loaded earlier — confirm it is present before running standalone.
out <- SISTRAT23_c1_2010_2024_df_prev1w %>%
  left_join(SISTRAT23_c1_2010_2024_df_prev1t[,c("rn", "birth_date_rec", "dg_psiq_cie_10_instudy", "dg_psiq_cie_10_dg")], by="rn", multiple="first")%>%
  arrange(hash_key, adm_age_rec3)%>%
  mutate(
    # Clean all substance columns
    across(all_of(c("primary_sub", sec_cols)), ~ .x %>% stringr::str_squish() %>% stringr::str_to_lower()),
    
    # Count real secondary substances (non-NA, non-empty, not "sin sustancia...")
    n_real_secondary = rowSums(
      across(all_of(sec_cols), ~ 
        !is.na(.x) & 
        .x != "" & 
        !stringr::str_detect(.x, "sin\\s*sustancia\\s*principal")
      ),
      na.rm = TRUE
    ),
    # Define polysubstance logic:
    # Case 1: primary_sub is NA → polysubstance if ≥2 real secondaries
    # Case 2: primary_sub NOT NA → polysubstance if any secondary ≠ primary_sub
    has_real_secondary = case_when(
      is.na(primary_sub) ~ n_real_secondary >= 2,
      TRUE ~ if_any(
        all_of(sec_cols),
        ~ !is.na(.x) & 
          .x != "" & 
          !stringr::str_detect(.x, "sin\\s*sustancia\\s*principal") & 
          .x != primary_sub
      )
    ),
    polysubstance_strict = as.integer(has_real_secondary)
  )%>%  
  # Recode the plan/program code into sponsor (WO/GP) and care level.
  mutate(
    plan_code_std = plan_type_corr %>%
      stringr::str_trim() %>% stringr::str_to_lower() %>% stringr::str_replace_all("[ _]", "-"),

    plan_sponsor = case_when(
      stringr::str_detect(plan_code_std, "^m-")  ~ "WO",
      stringr::str_detect(plan_code_std, "^pg-") ~ "GP",
      TRUE ~ NA_character_
    ),
    plan_level = case_when(
      stringr::str_detect(plan_code_std, "pr$")  ~ "residential",
      stringr::str_detect(plan_code_std, "pai$") ~ "intensive ambulatory",
      stringr::str_detect(plan_code_std, "pab$") ~ "basic ambulatory",
      TRUE ~ NA_character_
    ),

    # Optional combined label if you still want one
    plan_type_mod = case_when(
      !is.na(plan_sponsor) & !is.na(plan_level) ~ paste(plan_sponsor, plan_level),
      !is.na(plan_level)                        ~ plan_level,
      TRUE                                      ~ NA_character_
    ),
    # Collapse the rare "WO basic ambulatory" cell into "WO intensive ambulatory".
    plan_type_mod = case_when(plan_type_mod=="WO basic ambulatory"~ "WO intensive ambulatory",
      TRUE                                      ~ plan_type_mod
    )
  )%>%
  # Carry each patient's FIRST plan type across all of their episodes.
  group_by(hash_key)%>%
    mutate(plan_type_mod = first(plan_type_mod))%>%
  ungroup()%>%
  mutate(
    prim_sub_daily = case_when(
      is.na(prim_sub_freq) ~ NA_integer_,
      prim_sub_freq == "5. Daily"   ~ 1L,
      TRUE                          ~ 0L
    )
  )%>%
  # Completion (1) vs not (0); "currently in" treatment stays NA.
  # NOTE: tr_completion itself is dropped in the final select() below —
  # only tr_noncompletion survives in `out`.
  mutate(
    tr_completion = case_when(
      tr_compliance_rec7 == "completion"        ~ 1L,
      stringr::str_detect(tr_compliance_rec7, "current") ~ NA_integer_,
      TRUE                           ~ 0L
    ),
    # Non-completion (1) vs completion (0); keep NA for "currently in"
    tr_noncompletion = case_when(
      is.na(tr_completion) ~ NA_integer_,
      tr_completion == 1L  ~ 0L,
      TRUE                 ~ 1L
    )
  )%>%
  # Collapse the admission primary substance into 5 analysis categories.
  mutate(
          primary_sub_std = stringr::str_to_lower(stringr::str_squish(primary_sub)),
          primary_sub_mod = case_when(
              stringr::str_detect(primary_sub_std, "cocaine\\s*(base\\s*)?paste|pasta\\s*base|\\bpaco\\b") ~ "cocaine paste",
              stringr::str_detect(primary_sub_std, "cocaine\\s*(powder|hydrochloride|hcl)") |
                  stringr::str_detect(primary_sub_std, "clorhidrato\\s*de\\s*coca|clorhidrato|hidrocloruro") ~ "cocaine powder",
              stringr::str_detect(primary_sub_std, "\\balcohol\\b") ~ "alcohol",
              stringr::str_detect(primary_sub_std, "marij|cannab|marihu") ~ "marijuana",
              TRUE ~ "others"
          )) %>%
      mutate(primary_sub_mod = factor(primary_sub_mod,
                                       levels = c("cocaine paste","cocaine powder","alcohol","marijuana","others"))) %>%
  mutate(treat_log_days = if_else(!is.na(dit_rec6), log1p(dit_rec6), NA_real_),
         treat_lt_90   = if_else(!is.na(dit_rec6), as.integer(dit_rec6 < 90), NA_integer_))%>%# log(1+days)
  mutate(biopsych_comp_severe = case_when(
      biopsych_comp == "3-severe"   ~ 1L,
      biopsych_comp %in% c("1-mild", "2-moderate") ~ 0L,
      TRUE ~ NA_integer_))%>%   # keep NA if any missing)
  # A confirmed diagnosis overrides the "in study" flag.
  mutate(dg_psiq_cie_10_instudy= if_else(dg_psiq_cie_10_dg==TRUE, FALSE, dg_psiq_cie_10_instudy))%>%
  # Drop intermediate/helper columns (tr_completion included; see note above).
  select(-any_of(c("primary_sub_std", "plan_code_std", "plan_sponsor", "plan_level", "n_real_secondary", "has_real_secondary", "tr_completion", "inconsistency_pair", "change_reason", "status_is_working", "occupation_condition_std", "occupation_status_std", "status_is_working", "ed_attainment_aux", "obs")))
  
# 2) Labels (picked up by packages like table1/gtsummary; TableOne itself doesn’t use labels)
attr(out$tr_noncompletion,          "label") <- "Non-completion status of treatment (Dropout / Misspelled)"
attr(out$biopsych_comp_severe,     "label") <- "Biopsychosocial compromise (Severe)"
attr(out$treat_lt_90,               "label") <- "Treatment duration (binary) (<90 days)"
attr(out$treat_log_days,            "label") <- "Treatment duration (log-scaled days)"
attr(out$adm_age_rec3,              "label") <- "Age at admission to treatment"
attr(out$birth_date_rec,            "label") <- "Birth year"

# Primary substance (initial diagnosis) – label the variable; levels print as rows
attr(out$sus_ini_mod_mvv,           "label") <- "Primary substance (initial diagnosis)"

# Psychiatric comorbidity
attr(out$dg_psiq_cie_10_instudy,    "label") <- "Psychiatric comorbidity (ICD-10): In study"
attr(out$dg_psiq_cie_10_dg,         "label") <- "Psychiatric comorbidity (ICD-10): Diagnosis"

# Frequency / Daily
attr(out$prim_sub_daily,            "label") <- "Daily frequency of primary substance used at admission"

# Occupational status (levels like “Inactive”, “Unemployed” will print under this)
attr(out$occupation_condition_corr24,"label") <- "Occupational Status"

# Primary substance at admission – label the variable; levels print as rows
attr(out$primary_sub_mod,               "label") <- "Primary substance at admission to treatment"


# Optional: quick audits mirroring the earlier Excel pivots.
# NOTE(review): assumes occupation_status_corr / occupation_condition_corr
# still exist in `out` after the select() above — confirm.
audit1 <- out %>%
  count(occupation_status_corr, occupation_condition_corr, name = "n") %>%
  arrange(occupation_condition_corr, desc(n))

# Inactive/unemployed rows no longer carry a working status.
# Employed rows all have a status (with “other” as the fallback when missing).
# Birth date as a decimal year (year + fraction of the year elapsed).
out$year_decimal <- lubridate::year(out$birth_date_rec) +
  (lubridate::yday(out$birth_date_rec) - 1) / 365.25

# Drop a leftover helper column if present (try() in case it is absent).
try(out$occupation_condition_inferred <- NULL)

# Keep the corrected data frame in the Global Environment as a named object:
assign("SISTRAT23_c1_2010_2024_df_prev1w_corr", out, envir = .GlobalEnv)

rm(out)

invisible("to explore")
# SISTRAT23_c1_2010_2024_df_prev1w_corr[, c("hash_key", "adm_age_rec3", "disch_date_num_rec6", "disch_date_num_rec6_trans", "adm_date_num_rec2", "tr_completion", "polysubstance_strict", "adm_disch_reason")]|> head(20)|> dput()

#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
# DISCARD DATA
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
# ------------------------------------------------------------
# 0) Canonical ordering within each patient (hash_key)
#    Primary sort: admission date; tie-breaker: transformed discharge date
# ------------------------------------------------------------
SISTRAT23_c1_2010_2024_df_prev1w_corr_ord <-
  SISTRAT23_c1_2010_2024_df_prev1w_corr|>
  tidytable::ungroup()|>
  tidytable::arrange(hash_key, adm_age_rec3, disch_date_num_rec6_trans)

# ------------------------------------------------------------
# 1) Mark first treatment per hash_key (1 = first treatment, 0 = otherwise)
#    FIX: build from the canonically ordered *_ord frame. The original piped
#    from the UNSORTED *_corr frame, so the ordering computed in step 0 was
#    never used and row_number() == 1 could tag the wrong episode whenever
#    admission ages tie (only the discharge-date tie-breaker distinguishes
#    them).
# ------------------------------------------------------------
SISTRAT23_c1_2010_2024_df_prev1w_corr_marked <-
  SISTRAT23_c1_2010_2024_df_prev1w_corr_ord|>
  tidytable::group_by(hash_key)|>
  tidytable::mutate(
    initial_treatment = as.integer(tidytable::row_number() == 1L),
    episodes = tidytable::n()
  )|>
  tidytable::ungroup()


# ------------------------------------------------------------
# 2a) Flag treatment EPISODES to discard:
#     - adm_disch_reason == "death"  OR
#     - tr_compliance_rec7 == "currently in"
#     NOTE(review): despite the original heading ("drop the ENTIRE PATIENT
#     if their FIRST treatment..."), this flags every matching episode and
#     the join in the next chunk removes only those ROWS, not the whole
#     patient — confirm which behavior is intended.
# ------------------------------------------------------------
first_flags <-
  SISTRAT23_c1_2010_2024_df_prev1w_corr_marked|>
  tidytable::mutate(
    discard =
      grepl("^death$", as.character(adm_disch_reason), ignore.case = TRUE) |
      grepl("^currently\\s*in$", as.character(tr_compliance_rec7), ignore.case = TRUE)
  )|>
  tidytable::select(rn, discard)

# Report how many episodes carry the discard flag.
message(paste0("Any treatment w/ death or ongoing tr: ", table(first_flags$discard)[[2]]))

Any treatment w/ death or ongoing tr: 3792

Code
# Remove the flagged episodes (rows). coalesce(discard, FALSE) treats any
# rn that did not match in the join as "keep".
SISTRAT_clean_drop_patient <-
  SISTRAT23_c1_2010_2024_df_prev1w_corr_marked|>
  tidytable::left_join(first_flags, by = "rn")|>
  tidytable::filter(!tidytable::coalesce(discard, FALSE))|>
  tidytable::select(-discard)

# ------------------------------------------------------------
# 3) Drop the ENTIRE PATIENT if only one treatment
# ------------------------------------------------------------

# The two lambda-pipe stages below only log counts and stash them in the
# Global Environment via ->> (hash_only_one_tr, rows_final_db,
# hashs_final_db); they pass the data through unchanged.
# NOTE(review): hash_only_one_tr stores a COUNT of single-episode patients,
# not their hash keys, despite its name.
SISTRAT_clean_drop_patient_multiple<-
SISTRAT_clean_drop_patient|>
  arrange(hash_key, adm_age_rec3)|>
  #2025-09-22: count again (episode counts changed after dropping rows above)
  tidytable::group_by(hash_key)|> 
  tidytable::mutate(
    initial_treatment = as.integer(tidytable::row_number() == 1L),
    episodes= tidytable::n(),
  )|> 
  tidytable::ungroup()|> 
      (\(df) {
        (message(paste0("Only one tr: ", filter(df, episodes==1)|>pull(hash_key)|> length())))
        filter(df, episodes==1)|>pull(hash_key)|> length() ->> hash_only_one_tr
        df
    })()|>
  filter(episodes>1)|> 
  (\(df) {
    (message(paste0("Multiple treatments, Entries: ", nrow(df))))
    (message(paste0("Multiple treatments, RUNs: ", distinct(df, hash_key)|> nrow())))
    nrow(df) ->> rows_final_db
    distinct(df, hash_key)|> nrow() ->> hashs_final_db
    df
  })()

Only one tr: 91149

Multiple treatments, Entries: 67957

Multiple treatments, RUNs: 27445

Code
#_#_#_#_#_#_#_#_#_#_#_#_#_
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#_#_#_#_#_#_#_#_#_#_#_#_#_
invisible("Function to format CreateTableOne into a database")

#' Convert a tableone::TableOne object into a tidy data.frame.
#'
#' `print.TableOne()` returns the formatted character matrix invisibly, so
#' the `-> x` inside capture.output() keeps that matrix while the printed
#' text itself is discarded. Variable names live in the matrix row names,
#' with blanks for the extra rows of multi-level factors; those blanks are
#' converted to NA and filled downward.
#'
#' @param x A TableOne object.
#' @param ... Passed on to print.TableOne (e.g. nonnormal, smd).
#' @return A data.frame with a leading `characteristic` column.
as.data.frame.TableOne <- function(x, ...) {
  capture.output(print(x, showAllLevels = TRUE, varLabels = TRUE, ...) -> x)
  y <- as.data.frame(x)
  # Blank row names mark continuation rows of the previous variable
  y$characteristic <- dplyr::na_if(rownames(x), "")
  y <- y %>%
    tidyr::fill(characteristic, .direction = "down") %>%
    dplyr::select(characteristic, everything())
  rownames(y) <- NULL
  y
}
#_#_#_#_#_#_#_#_#_#_#_#_#_

# Pick the strata variable automatically
# NOTE(review): the fallback "polysubstance" is not itself checked for
# existence; downstream code will error if neither column is present.
strata_var <- if ("polysubstance_strict" %in% names(SISTRAT_clean_drop_patient_multiple)) "polysubstance_strict" else "polysubstance"

# Variables of interest (as you listed)
# NOTE: the order of this vector controls the row order of the TableOne output.
vars_interest <- c(
  "tr_noncompletion",
  "biopsych_comp_severe",          # Biopsychosocial compromise
  "treat_lt_90",
  "treat_log_days",
  "adm_age_rec3",                  # Age at Admission (First Entry)
  "year_decimal",                # Birth year (assumed numeric or year-like)
  "sus_ini_mod_mvv",               # Primary substance of the initial diagnosis
  "dg_psiq_cie_10_dg",             # Psychiatric comorbidity (ICD-10)
  "dg_psiq_cie_10_instudy",        # Psychiatric comorbidity (ICD-10) [In study]
  #"cie10_broad_cat",               # Broad diagnostic categories
  #"prim_sub_freq",                 # Frequency of primary substance use at admission
  "prim_sub_daily",                # Daily (0/1 you created)
  "occupation_condition_corr24",   # Occupational status (corrected)
  "primary_sub_mod"                    # Primary substance at admission to treatment
  )

# Which of those should be treated as categorical in TableOne
# (subset of vars_interest; numeric vars listed here are shown as counts/%)
factorVars_interest <- c(
  "biopsych_comp_severe",
  "sus_ini_mod_mvv",
  "dg_psiq_cie_10_dg",
  "dg_psiq_cie_10_instudy",
  #"cie10_broad_cat",
  #"prim_sub_freq",
  "prim_sub_daily",               # treat 0/1 as categorical to show counts/%
  "occupation_condition_corr24",
  "primary_sub_mod", 
  "tr_noncompletion",
  "treat_lt_90"
)


# 2) Labels (these will be picked up by packages like table1/gtsummary; TableOne itself doesn’t use labels)
# One name -> label map applied in a loop instead of repeated attr() calls.
var_labels <- c(
  tr_noncompletion            = "Non-completion status of treatment (Dropout / Misspelled)",
  biopsych_comp_severe        = "Biopsychosocial compromise (Severe)",
  treat_lt_90                 = "Treatment duration (binary) (<90 days)",
  treat_log_days              = "Treatment duration (log-scaled days)",
  adm_age_rec3                = "Age at admission to treatment",
  year_decimal                = "Birth year",
  # Primary substance (initial diagnosis) – label the variable; levels print as rows
  sus_ini_mod_mvv             = "Primary substance (initial diagnosis)",
  # Psychiatric comorbidity
  dg_psiq_cie_10_instudy      = "Psychiatric comorbidity (ICD-10): In study",
  dg_psiq_cie_10_dg           = "Psychiatric comorbidity (ICD-10): Diagnosis",
  # Frequency / Daily
  prim_sub_daily              = "Daily frequency of primary substance used at admission",
  # Occupational status (levels like “Inactive”, “Unemployed” will print under this)
  occupation_condition_corr24 = "Occupational Status",
  # Primary substance at admission – label the variable; levels print as rows
  primary_sub_mod             = "Primary substance at admission to treatment"
)
for (v in names(var_labels)) {
  attr(SISTRAT_clean_drop_patient_multiple[[v]], "label") <- var_labels[[v]]
}

# Optional: if birth_date_rec is a Date, turn into year (uncomment if needed)
# if (inherits(df$birth_date_rec, "Date")) {
#   df <- df %>% mutate(birth_date_rec = as.integer(format(birth_date_rec, "%Y")))
# }


# Stratified descriptive table on ONE row per patient (first row of each
# hash_key). Use `strata_var` consistently: the original hard-coded
# "polysubstance_strict" in the select(), which would error whenever
# strata_var fell back to "polysubstance".
tbone_interest <- tableone::CreateTableOne(
  vars       = vars_interest,
  data       = tidytable::slice_head(SISTRAT_clean_drop_patient_multiple, n = 1, .by = hash_key) |>
    select(all_of(c(vars_interest, strata_var))),
  factorVars = factorVars_interest,
  strata     = strata_var,
  addOverall = TRUE,
  includeNA  = TRUE,   # show explicit NA rows for factors
  smd        = TRUE,
  test       = TRUE
)

# Flatten to a data.frame; skewed numerics are reported as median [IQR]
df_tbone <- as.data.frame.TableOne(
  tbone_interest,
  nonnormal = c("adm_age_rec3", "year_decimal", "treat_log_days"),
  smd = TRUE
)

# Build desired order as a tibble with both characteristic and level
# Build desired order as a tibble with both characteristic and level.
# Rows with level = NA match the explicit-missing rows produced by
# includeNA = TRUE; table rows not listed here get order_id = NA after the
# join and therefore sort last.
order_df <- tibble::tribble(
  ~characteristic, ~level,
  "n", "",
  "Non-completion status of treatment (Dropout / Misspelled) (%)", "0",
  "Non-completion status of treatment (Dropout / Misspelled) (%)", "1",
  "Biopsychosocial compromise (Severe) (%)", "0",
  "Biopsychosocial compromise (Severe) (%)", "1",
  "Biopsychosocial compromise (Severe) (%)", NA,
  "Treatment duration (binary) (<90 days) (%)", "0",
  "Treatment duration (binary) (<90 days) (%)", "1",
  "Treatment duration (log-scaled days) (median [IQR])", "",
  "Age at admission to treatment (median [IQR])", "",
  "Birth year (median [IQR])", "",
  "Primary substance (initial diagnosis) (%)", "cocaine powder",
  "Primary substance (initial diagnosis) (%)", "cocaine paste",
  "Primary substance (initial diagnosis) (%)", "marijuana",
  "Primary substance (initial diagnosis) (%)", "alcohol",
  "Primary substance (initial diagnosis) (%)", "others",
  "Primary substance (initial diagnosis) (%)", NA,
  "Psychiatric comorbidity (ICD-10): In study (%)", "TRUE",
  "Psychiatric comorbidity (ICD-10): In study (%)", "FALSE",
  "Psychiatric comorbidity (ICD-10): Diagnosis (%)", "TRUE",
  "Daily frequency of primary substance used at admission (%)", "0",
  "Daily frequency of primary substance used at admission (%)", "1",
  "Daily frequency of primary substance used at admission (%)", NA,
  "Occupational Status (%)", "inactive",
  "Occupational Status (%)", "unemployed",
  "Occupational Status (%)", "employed",
  "Primary substance at admission to treatment (%)", "cocaine powder",
  "Primary substance at admission to treatment (%)", "cocaine paste",
  "Primary substance at admission to treatment (%)", "marijuana",
  "Primary substance at admission to treatment (%)", "alcohol",
  "Primary substance at admission to treatment (%)", "others"
)

# Join to assign sort order (rows absent from order_df get NA and sort last)
order_key <- order_df %>% mutate(order_id = dplyr::row_number())
df_tbone_ordered <- df_tbone %>%
  left_join(order_key, by = c("characteristic", "level")) %>%
  arrange(order_id) %>%
  select(-order_id)

df_tbone_ordered |>
  # Keep the informative level rows ("1", "TRUE", named categories).
  # NOTE: this also drops rows whose level is NA, since NA comparisons
  # yield NA and filter() removes them.
  filter(level != 0, level != FALSE) |>
  mutate(level = gsub("powder", "hydrochloride", level)) |>
  # Collapse the padding tableone inserts after "(" in counts, e.g. "( 8.3" -> "(8.3"
  mutate(
    Overall = gsub("\\(\\s+", "(", Overall),
    `0` = gsub("\\(\\s+", "(", `0`),
    `1` = gsub("\\(\\s+", "(", `1`)
  ) |>
  select(characteristic, level, `0`, `1`, Overall, SMD) |>
  tibble::as_tibble() |>
  (\(df) {
    # Bold the header row across all columns
    format_cells(df, 1, seq_along(names(df)), "bold")
  })() |>
  # FIX: the original call kable("markdown", "Descriptives") passed
  # "Descriptives" positionally into `digits`; it was meant as the caption.
  knitr::kable(format = "markdown", caption = "Descriptives")
characteristic level 0 1 Overall SMD
n 5956 21489 27445
Non-completion status of treatment (Dropout / Misspelled) (%) 1 4323 (72.6) 16909 (78.7) 21232 (77.4)
Biopsychosocial compromise (Severe) (%) 1 1520 (25.5) 8291 (38.6) 9811 (35.7)
Treatment duration (binary) (<90 days) (%) 1 1196 (20.1) 5329 (24.8) 6525 (23.8)
Treatment duration (log-scaled days) (median [IQR]) 5.25 [4.65, 5.80] 5.13 [4.51, 5.70] 5.16 [4.53, 5.72] 0.146
Age at admission to treatment (median [IQR]) 38.56 [30.44, 47.58] 31.66 [26.12, 38.65] 32.82 [26.76, 40.69] 0.625
Birth year (median [IQR]) 1977.48 [1968.61, 1985.57] 1982.91 [1975.73, 1988.85] 1982.07 [1974.10, 1988.34] 0.492
Primary substance (initial diagnosis) (%) cocaine hydrochloride 492 (8.3) 1681 (7.8) 2173 (7.9)
Primary substance (initial diagnosis) (%) cocaine paste 789 (13.2) 2312 (10.8) 3101 (11.3) 0.267
Primary substance (initial diagnosis) (%) marijuana 533 (8.9) 3766 (17.5) 4299 (15.7)
Primary substance (initial diagnosis) (%) alcohol 4064 (68.2) 13428 (62.5) 17492 (63.7)
Primary substance (initial diagnosis) (%) others 41 (0.7) 81 (0.4) 122 (0.4)
Psychiatric comorbidity (ICD-10): In study (%) TRUE 862 (14.5) 3903 (18.2) 4765 (17.4)
Psychiatric comorbidity (ICD-10): Diagnosis (%) TRUE 2638 (44.3) 10148 (47.2) 12786 (46.6)
Daily frequency of primary substance used at admission (%) 1 2561 (43.0) 10377 (48.3) 12938 (47.1)
Occupational Status (%) inactive 1240 (20.8) 4214 (19.6) 5454 (19.9)
Occupational Status (%) unemployed 1758 (29.5) 8263 (38.5) 10021 (36.5)
Occupational Status (%) employed 2958 (49.7) 9012 (41.9) 11970 (43.6) 0.194
Primary substance at admission to treatment (%) cocaine hydrochloride 810 (13.6) 4789 (22.3) 5599 (20.4)
Primary substance at admission to treatment (%) cocaine paste 1720 (28.9) 10507 (48.9) 12227 (44.6) 0.710
Primary substance at admission to treatment (%) marijuana 189 (3.2) 1383 (6.4) 1572 (5.7)
Primary substance at admission to treatment (%) alcohol 3138 (52.7) 4435 (20.6) 7573 (27.6)
Primary substance at admission to treatment (%) others 99 (1.7) 375 (1.7) 474 (1.7)
Code
# Labels for the flowchart nodes. "\\l" left-justifies lines inside
# Graphviz labels; "&#8226;" renders as a bullet point.
lab_orig <- paste0(
  "Original C1 Dataset \n(n = ",
  formatC(nrow(df23_ndp_20250824_SISTRAT23_c1_1024), format = "f", big.mark = ",", digits = 0),
  "; Patients = ",
  formatC(dplyr::n_distinct(df23_ndp_20250824_SISTRAT23_c1_1024$hash_key), format = "f", big.mark = ",", digits = 0),
  ")"
)

lab_orig_clean <- paste(
  "Pre-processing & Quality Checks",
  "&#8226; Remove exact duplicate records",
  "&#8226; Resolve overlapping episodes (keep longest / merge)",
  "&#8226; Fix inconsistent admission/discharge dates",
  "&#8226; Correct implausible birth dates/ages",
  "&#8226; Remove negative/implausible days in treatment",
"",
  sep = "\\l"
)
lab_post_orig  <- paste0('C1 Dataset \n(n = ', formatC(nrow(SISTRAT23_c1_2010_2024_df_prev1t), format='f', big.mark=',', digits=0), '; Patients = ',formatC(dplyr::distinct(SISTRAT23_c1_2010_2024_df_prev1t, hash_key)|> nrow(), format='f', big.mark=',', digits=0),')')
lab_post_orig_clean <- paste("&#8226; Collapse consecutive linked", "treatments (≤45d gap, referral=yes)", sep = "\\l")

lab_proc  <- paste0('C1 Cleaned & Collapsed\n(n = ', formatC(nrow(SISTRAT23_c1_2010_2024_df_prev1w), format='f', big.mark=',', digits=0), '; Patients = ',formatC(dplyr::distinct(SISTRAT23_c1_2010_2024_df_prev1w, hash_key)|> nrow(), format='f', big.mark=',', digits=0),')')
lab_flag  <- "Sorted by Admission Date\n+ First Treatment Flag"
lab_after <- "After First-Treatment Rule\nn = 169,936; Patients = 117,507"  # example counts
lab_final <- paste0("Final C1 Dataset\nn = ",
                    formatC(rows_final_db, format = "f", digits = 0, big.mark = ","),
                    "; Patients = ",
                    formatC(hashs_final_db, format = "f", digits = 0, big.mark = ","))

lab_discard_first <- paste(
  "Discard treatment episodes if:",
  "&#8226; Administrative discharge reason= \\\"death\\\"",
  "&#8226; Tr. compliance status = \\\"currently in\\\"",
  "",
  # sum() is robust when zero episodes are flagged; table(...)[[2]] would error
  paste0("n = ", formatC(sum(first_flags$discard, na.rm = TRUE), format = "f", digits = 0, big.mark = ",")," put aside"),
"",
  sep = "\\l"
)

lab_discard_single <- paste0(
  "Discard patients with only ONE treatment\n",
  "Patients = ", formatC(hash_only_one_tr, format = "f", big.mark = ",", digits = 0)
)
# Diagram
# Assemble the cohort flowchart as Graphviz DOT. The node labels built above
# are spliced into the DOT source with paste0(); invisible zero-width point
# nodes (v00, v0, v1–v3) split the main arrows so side notes can attach at
# the same rank. The '#'-prefixed lines inside the string are comments in
# the DOT source, not R comments.
gr <- DiagrammeR::grViz(
    paste0(
        'digraph flowchart {
      graph [layout = dot, rankdir = TB]

      node [fontname = "Times", shape = rectangle, fontsize = 15, style = filled, fillcolor = white, ranksep=0.25, nodesep=0.25]

      # New initial box and pre-clean note
      pre_original   [label = "', lab_orig, '", fillcolor = lightgray, shape = box]
      pre_clean      [label = "', lab_orig_clean, '", shape = note, fillcolor = white]

      # Intermediate new initial box and pre-clean note
      pre2_original   [label = "', lab_post_orig, '", fillcolor = white, shape = box]
      pre2_clean      [label = "', lab_post_orig_clean, '", shape = note, fillcolor = lemonchiffon]

      # Existing boxes
      original       [label = "', lab_proc, '", fillcolor = lightgray, shape = box]
      marked         [label = "', lab_flag, '", shape = box]
      after_rule     [label = "', lab_after, '", shape = box]
      final_dataset  [label = "', lab_final, '", fillcolor = lightgray, shape = box]

      discard_first   [label = "', lab_discard_first, '", shape = note, fillcolor = mistyrose]
      discard_single  [label = "', lab_discard_single, '", shape = note, fillcolor = mistyrose]

      # Invisible points for alignment
      v00 [shape = point, width = 0, style = invis]  # between orig and post orig
      v0 [shape = point, width = 0, style = invis]  # between pre_original and original      
      v1 [shape = point, width = 0, style = invis]
      v2 [shape = point, width = 0, style = invis]
      v3 [shape = point, width = 0, style = invis]

      # Main flow (add pre_original -> v0 -> original in front)
      pre_original -> v00 [arrowhead = none]
      v00 -> pre2_original
      
      pre2_original -> v0 [arrowhead = none]
      v0 -> original      

      original -> v1 [arrowhead = none]
      v1 -> marked
      marked -> v2 [arrowhead = none]
      v2 -> after_rule
      after_rule -> v3 [arrowhead = none]
      v3 -> final_dataset

      # Discard / note connections with solid black arrows
      v00 -> pre_clean     [color = black]     # new pre-processing note
      v0 -> pre2_clean     [color = black]     # new pre-processing note
      v2 -> discard_first [color = black]
      v3 -> discard_single[color = black]

      # Alignment
      { rank = same; pre_clean; v00 }
      { rank = same; pre2_clean; v0 }
      { rank = same; discard_first; v2 }
      { rank = same; discard_single; v3 }
    }'
    ),
  width = 600, height = 900
)

gr

Figure 3. Flowchart

Code
# Export the flowchart: render the htmlwidget, then screenshot it to PNG.
# The figures directory is derived once instead of rebuilding the same
# paste0(gsub(...)) expression three times.
figs_dir  <- paste0(gsub("/cons", "", getwd()), "/cons/_figs")
flow_html <- paste0(figs_dir, "/_flowchart_psu.html")
flow_png  <- paste0(figs_dir, "/_flowchart_psu.png")

unlink(paste0(figs_dir, "/_flowchart_psu_files"), recursive = TRUE)
htmlwidgets::saveWidget(gr, flow_html)
webshot::webshot(flow_html,
                 flow_png,
                 vwidth = 300, vheight = 300*1.5, zoom = 10, expand = 100)  # Experiment with different top, left, width, and height coordinates

Registered S3 methods overwritten by ‘callr’: method from format.callr_status_error
print.callr_status_error

We compared the characteristics of people who used multiple substances and those who did not in the total database between 2010 and 2024.

Missingness

Code
# Percentage of rows with at least one missing value among the analysis
# variables. mean(complete.cases(...)) replaces the original
# df[which(complete.cases(...)), ] %>% nrow() / nrow(df) construction,
# which materialized a full subset of the data just to count rows.
paste0(
  "Percentage of the total that has missing values: ",
  scales::percent(
    1 - mean(
      complete.cases(
        subset(
          SISTRAT_clean_drop_patient_multiple,
          select = c(vars_interest, "polysubstance_strict", "plan_type_mod")
        )
      )
    ),
    accuracy = 0.1
  ),
  "; total = ",
  format(nrow(SISTRAT_clean_drop_patient_multiple), big.mark = ",")
)
#[1] "Percentage of the total that has missing values: 84.4%; total= 90,075"
# 76029

#SEP 2025
#[1] "Percentage of the total that has missing values: 2.4%; total = 83,174"

# Variables to check
vars_check <- c(vars_interest, "polysubstance_strict", "plan_type_mod")

# Compute % missing per variable (colMeans over the is.na logical matrix)
missing_pct <- colMeans(
  is.na(subset(SISTRAT_clean_drop_patient_multiple, select = vars_check))
) * 100
# Round and format into data.frame. row.names = NULL: without it the
# variable names become row names and kable prints a duplicate name column.
df_missing <- data.frame(
  Variable = names(missing_pct),
  Missing_Percent = round(missing_pct, 2),
  row.names = NULL
)
# Display as markdown table
knitr::kable(df_missing, format = "markdown", align = c("l","r"),
             col.names = c("Variable", "% Missing"))
[1] "Percentage of the total that has missing values: 2.3%; total = 67,957"
Variable % Missing
tr_noncompletion tr_noncompletion 0.00
biopsych_comp_severe biopsych_comp_severe 1.16
treat_lt_90 treat_lt_90 0.00
treat_log_days treat_log_days 0.00
adm_age_rec3 adm_age_rec3 0.00
year_decimal year_decimal 0.00
sus_ini_mod_mvv sus_ini_mod_mvv 0.69
dg_psiq_cie_10_dg dg_psiq_cie_10_dg 0.00
dg_psiq_cie_10_instudy dg_psiq_cie_10_instudy 0.00
prim_sub_daily prim_sub_daily 0.51
occupation_condition_corr24 occupation_condition_corr24 0.00
primary_sub_mod primary_sub_mod 0.00
polysubstance_strict polysubstance_strict 0.00
plan_type_mod plan_type_mod 0.00
Code
# Fixed seed so the imputation below is reproducible
set.seed(2125)

# -----------------------------
# 0) Targets (ONLY these will be imputed)
#    <- you already have `vars_interest`; we add the two extra vars you asked for
# -----------------------------
vars_impute <- unique(c(vars_interest, "polysubstance_strict", "plan_type_mod"))

# Keep only variables that actually exist to avoid errors
vars_impute <- intersect(vars_impute, names(SISTRAT_clean_drop_patient_multiple))
if (length(vars_impute) == 0) stop("None of the vars in vars_interest/polysubstance_strict/plan_type_mod exist in the data.")

# -----------------------------
# 1) Predictors (you can expand/trim this list as you prefer)
#    Tip: include easy, generally-available covariates to make MAR more plausible
#    FIX: "episodes" and "pub_center" were listed twice in the original;
#    the duplicates were harmless (unique() below) but misleading.
# -----------------------------
predictor_pool <- c(
  # strong helpers already in your dataset
  "adm_age_rec3", "year_decimal", "treat_log_days", "treat_lt_90",
  "biopsych_comp_severe", "tr_noncompletion", "prim_sub_daily",
  "primary_sub_mod", "occupation_condition_corr24",
  "pub_center", "type_center", "episodes",
  "sex_rec", "plan_type_mod", "primary_sub_std", "tr_completion",
  "dg_psiq_cie_10_dg", "dg_psiq_cie_10_instudy",
  "age_subs_onset_rec2", "age_primary_onset_rec2",
  "tenure_status_household", "macrozone_center", "adm_motive",
  "marital_status", "ed_attainment", "sub_dep_icd10_status", "senda", "num_trat_ant",
  "fecha_ultimo_tratamiento"
)

# Use only columns that exist; targets are also allowed as predictors
predictor_pool <- intersect(unique(c(predictor_pool, vars_impute)), names(SISTRAT_clean_drop_patient_multiple))

# Make sure we still have something on the RHS
if (length(predictor_pool) == 0) stop("No predictors found in the data. Please add some predictor names that exist.")

# -----------------------------
# 2) Build formula: (targets) ~ (predictors)
#    missRanger will impute ONLY the LHS variables
# -----------------------------
lhs <- paste(vars_impute, collapse = " + ")
rhs <- paste(predictor_pool, collapse = " + ")
form_miss <- stats::as.formula(sprintf("%s ~ %s", lhs, rhs))

# -----------------------------
# 3) Arrange for reproducibility and run missRanger
# -----------------------------
dat_ord <- dplyr::arrange(SISTRAT_clean_drop_patient_multiple, hash_key, adm_date_num_rec2)

# columns to pass to missRanger = targets (impute) + predictors
cols_mr <- unique(c(vars_impute, predictor_pool))

# subset safely whether dat_ord is data.table/tidytable or data.frame
if (inherits(dat_ord, "data.table")) {
    # data.table: the `..` prefix evaluates cols_mr as a character vector of names
    dat_sub <- dat_ord[, ..cols_mr]
} else {
    # fallback for data.frame / tibble
    dat_sub <- dat_ord[, cols_mr, drop = FALSE]
}

# (optional) force to plain data.frame to avoid method dispatch surprises
dat_sub <- as.data.frame(dat_sub)

# Random-forest imputation restricted to the LHS variables of form_miss.
# pmm.k = 5 enables predictive-mean matching over 5 donors (imputed values
# are drawn from observed ones); returnOOB = TRUE attaches per-variable
# out-of-bag errors as the "oob" attribute used below.
SISTRAT_imputed_only <- missRanger::missRanger(
    data      = dat_sub,         # << use the safe subset
    formula   = form_miss,       # ONLY LHS vars in vars_impute will be imputed
    num.trees = 100,
    pmm.k     = 5,
    maxiter   = 50,
    returnOOB = TRUE,
    verbose   = 2,
    seed      = 2125
)

Missing value imputation by random forests


Variables to impute:        prim_sub_daily, sus_ini_mod_mvv, biopsych_comp_severe
Variables used to impute:   adm_age_rec3, year_decimal, treat_log_days, treat_lt_90, biopsych_comp_severe, tr_noncompletion, prim_sub_daily, primary_sub_mod, occupation_condition_corr24, episodes, sex_rec, plan_type_mod, dg_psiq_cie_10_dg, dg_psiq_cie_10_instudy, adm_motive, sub_dep_icd10_status, senda, sus_ini_mod_mvv, polysubstance_strict

    prm_s_  ss_n__  bpsy__
iter 1: 0.8733  0.3432  0.7810  
iter 2: 0.8394  0.3438  0.7789  
iter 3: 0.8397  0.3444  0.7813  
Code
# -----------------------------
# 4) Write imputed values back to your full data
# -----------------------------
# Make safe copies as data.frame
SISTRAT_clean_drop_patient_multiple_imp <- as.data.frame(dat_ord)
SISTRAT_imputed_only_df <- as.data.frame(SISTRAT_imputed_only)

# Regular data.frame column replacement works with a character vector of names
SISTRAT_clean_drop_patient_multiple_imp[vars_impute] <- SISTRAT_imputed_only_df[vars_impute]

# -----------------------------
# 5) OOB error summary → kable
#    (missRanger stores per-target OOB error in the "oob" attribute)
# -----------------------------
oob_vec <- attr(SISTRAT_imputed_only, "oob")
if (!is.null(oob_vec)) {
  df_oob <- data.frame(
    Variable = names(oob_vec),
    OOB_Error = round(as.numeric(oob_vec), 3)
  )
  df_oob <- df_oob[order(df_oob$Variable), , drop = FALSE]
  # FIX: reset row names after reordering; otherwise the original row
  # numbers (e.g. "3", "1", "2") leak into the rendered kable output.
  rownames(df_oob) <- NULL
  knitr::kable(df_oob, format = "markdown", align = c("l","r"),
               col.names = c("Variable", "OOB error (1 - R² or class error)"))
} else {
  message("No OOB error attribute returned by missRanger.")
}
Variable OOB error (1 - R² or class error)
3 biopsych_comp_severe 0.779
1 prim_sub_daily 0.839
2 sus_ini_mod_mvv 0.344
Code
# |   |Variable             | OOB error (1 - R² or class error)|
# |:--|:--------------------|---------------------------------:|
# |4  |biopsych_comp_severe |                             0.688|
# |1  |polysubstance_strict |                             0.790|
# |2  |prim_sub_daily       |                             0.814|
# |3  |sus_ini_mod_mvv      |                             0.338|
#<0.2 good; 0.2–0.4 fair; 0.4–0.6 poor; >0.6 very poor for classification-type targets.
#OOB prediction error per iteration and variable (1 minus R-squared for regression)

#The default mtry in missRanger is sqrt(p), where p is the number of variables in the dataset.
#OOB prediction errors are quantified as 1 - R^2 for numeric variables, and as classification error otherwise. If a variable has been imputed only univariately, the value is 1.
#https://rdrr.io/cran/missRanger/man/missRanger.html

We formatted the data for a survival-analysis setting.

Code
# Censor date you specified
censor_date <- as.Date("2025-05-28")

# Build the survival-format trajectory data:
#  - order episodes chronologically within patient,
#  - number treatments; first admission per patient = time origin,
#  - durations in calendar-aware months (lubridate),
#  - break exact within-person time ties with tiny offsets so times are
#    strictly increasing,
#  - right-censor at censor_date when tr_noncompletion == 0 (no event).
SISTRAT_traj <-
  SISTRAT_clean_drop_patient_multiple_imp|>
  tidytable::ungroup()|>
  # Canonical ordering within each patient
  tidytable::arrange(hash_key, adm_date_rec2, disch_date_rec6)|>
  tidytable::group_by(hash_key)|>
  # Treatment sequence and entry date
  tidytable::mutate(
    treatment     = tidytable::row_number(),
    is_first_occurrence = tidytable::if_else(treatment == 1, 1, 0),
    max_treatment = max(treatment, na.rm = TRUE),
    enter_date    = min(adm_date_rec2, na.rm = TRUE)
  )|>
  # FIX: the original computed id = cumsum(is_first_occurrence) here, but
  # that value was unconditionally overwritten below with
  # as.numeric(factor(hash_key)) and never used in between; dead step removed.
  tidytable::ungroup()|>
  # Durations in MONTHS via lubridate::time_length (no 30.5 division)
  tidytable::mutate(
    time      = lubridate::time_length(lubridate::interval(enter_date, disch_date_rec6),  unit = "months"),
    cens_time = lubridate::time_length(lubridate::interval(enter_date, censor_date),       unit = "months")
  )|>
  # Tie-breaker for identical times within person: add small offsets 0.001, 0.002, ...
  tidytable::mutate(dup_idx = tidytable::row_number(), .by = c(hash_key, time))|>
  tidytable::mutate(time = time + (dup_idx - 1L) * 0.001)|>
  # Lag (optional diagnostic)
  tidytable::mutate(lag_time = tidytable::lag(time), .by = hash_key)|>
  # Survival time: use censoring if no event (tr_noncompletion == 0)
  tidytable::mutate(
    surv_time = tidytable::if_else(tidytable::coalesce(tr_noncompletion, 0L) == 0L, cens_time, time),
    id = as.numeric(factor(hash_key))
  )|>
  # Clean up helpers
  tidytable::select(-dup_idx, -is_first_occurrence)

We constructed right‑censored survival times from the first recorded admission per patient (time origin), measuring follow‑up in months using calendar‑aware intervals (lubridate::interval + time_length). Episodes were ordered chronologically and minor within‑subject time ties were broken by adding negligible offsets (≤0.002 months) to ensure strictly increasing times. The event was defined as non‑completion of treatment (tr_noncompletion != 0); completions and ongoing treatments at the censoring date (2025‑05‑28) were right‑censored. A numeric patient id was generated from hash_key for modeling. Process is summarised in SISTRAT_traj database.

IrregLong

Abacus

Code
set.seed(21251)
# Abacus plot: one horizontal line per sampled subject with dots at their
# observation (admission) times. The unnamed positional arguments follow
# IrregLong::abacus.plot() — presumably 25 = number of subjects sampled,
# and 0 / 120 = the plotted time window in months; TODO confirm against
# the package documentation.
IrregLong::abacus.plot(
    25,
    "time",
    "id",
     data = as.data.frame(SISTRAT_traj),
    0,
    120,
    xlab.abacus = "Time (in months of follow-up)",
    ylab.abacus = "Subject",
    pch.abacus = 16,
    col.abacus = 1
)

Code
# Quantify how irregular the observation times are: for 50 candidate bin
# widths, compute proportions of subjects with 0 / 1 / 2+ observations per
# bin, plus an AUC-based summary (see counts$auc below).
counts <- IrregLong::extent.of.irregularity(as.data.frame(SISTRAT_traj),
                                 time="time",
                                 id="id",
   scheduledtimes=NULL, cutpoints=NULL,ncutpts=50, 
   maxfu=16*24, plot=F, legendx=30, legendy=0.8,
  formula=survival::Surv(time.lag,time,event)~1,tau=12*12)
# Hard-coded 3 x 50 matrix (rows: 0 / 1 / 2+ obs per bin; columns: the 50
# bin widths) — presumably a cached copy of the counts result so the plot
# below reproduces without re-running the computation; TODO confirm.
counts_counts<- 
structure(c(0, 0, 1, 0.214519948988887, 0.419402441246129, 
0.366077609764984, 0.385631869800206, 0.432780712941034, 0.18158741725876, 
0.499799599198397, 0.394835124795045, 0.105365276006559, 0.579908908726544, 
0.352370194935325, 0.0677208963381308, 0.638197607335884, 0.315455152729702, 
0.0463472399344143, 0.682565130260521, 0.284079848007704, 0.0333550217317752, 
0.71748496993988, 0.257524139187466, 0.0249908908726544, 0.745661019007712, 
0.2350357280217, 0.0193032529705876, 0.768762980506468, 0.215991983967936, 
0.0152450355255966, 0.788042200102685, 0.199701883105053, 0.0122559167922622, 
0.804253962470395, 0.185829234226028, 0.00991680330357685, 0.818358395111902, 
0.173381728491949, 0.00825987639614894, 0.830687348723421, 0.162212737162637, 
0.00709991411394217, 0.841265561425882, 0.152763709236655, 0.0059707293374628, 
0.850528329386045, 0.144491255237748, 0.00498041537620696, 0.858847105976659, 
0.136909112342332, 0.00424378168100908, 0.866415659602032, 0.129819234428453, 
0.00376510596951479, 0.873116568064358, 0.123627158623467, 0.00325627331217459, 
0.879356895609401, 0.11764073601749, 0.00300236837310986, 0.884843279632859, 
0.11254023197911, 0.00261648838803147, 0.889925305155766, 0.107711290349294, 
0.00236340449494029, 0.894568583807932, 0.103314930255768, 0.00211648593629948, 
0.898783931499362, 0.0993578065221352, 0.00185826197850246, 0.902723264711241, 
0.0955991983967936, 0.00167753689196575, 0.906379191950334, 0.0920848690387769, 
0.00153593901088891, 0.9097697077657, 0.0888241128722091, 0.00140617936209119, 
0.9129245503995, 0.085783254821331, 0.00129219477916873, 0.915873125561468, 
0.082926982491629, 0.00119989194690321, 0.918599623489403, 0.0803145685310014, 
0.00108580797959555, 0.921176076493162, 0.0778225071844569, 0.00100141632238083, 
0.923578976134086, 0.0755101111313536, 0.000910912734560029, 
0.925866057183237, 0.0732771327779526, 0.000856810038810403, 
0.928000385798335, 0.0712119426017811, 0.000787671599884261, 
0.930049189287666, 0.0691908492309294, 0.000759961481404367, 
0.931936600473675, 0.0673792028501447, 0.000684196676180644, 
0.933738730532318, 0.0656300315618953, 0.000631237905787004, 
0.935448888207036, 0.0639690864983556, 0.000582025294608356, 
0.937097505033377, 0.062341933283817, 0.000560561682806172, 0.938649116414647, 
0.0608261978502459, 0.000524685735106577, 0.940133037694013, 
0.059365738128141, 0.000501224177845714, 0.941533283016249, 0.0579991151133436, 
0.000467601870407482, 0.942889584666161, 0.0566553826469006, 
0.000455032686938359, 0.94417347090876, 0.0553950876960533, 0.000431441395187068, 
0.94538734033724, 0.0542175259610129, 0.000395133701746928, 0.946575364167069, 
0.0530357157001751, 0.00038892013275563, 0.94768957644496, 0.0519530356651407, 
0.000357387889899722, 0.948758122305217, 0.050912430922451, 0.000329446772332544, 
0.949789746468819, 0.0499009150025468, 0.000309338528634263, 
0.950787392967754, 0.0489152851156859, 0.000297321916560394), dim = c(3L, 
50L))

# counts$auc
# [1] 0.2248806
# > counts$transformed.auc
# [1] 86.18703
#→ 86 suggests substantial irregularity in observation times across subjects.

# Plot mean proportions of subjects with 0 / 1 / 2+ observations per bin as
# the bin width varies (x-axis in % of follow-up). The 3 x 50 matrix is
# transposed so rows become the 50 bin widths, then melted to long format.
# NOTE(review): `linewidth = 1` is passed into ggplot()'s `...`, not to
# geom_line(), so it has no effect; move it into geom_line() if a thicker
# line was intended (doing so would change the rendered figure).
plot_irregularity_05_2024<-
rbind.data.frame(cbind.data.frame(time= 1:50,data.frame(t(counts_counts)))) %>% 
    reshape2::melt(id=c("time")) %>% 
    dplyr::mutate(Observations= factor(variable, labels=c("0 obs per bin","1 obs per bin", "2+ obs per bin"))) %>% 
    ggplot(aes(x=time, y=value, linetype=Observations), linewidth=1)+
    geom_line()+
    #facet_wrap(~type)+ #color=factor(type))
    theme_classic()+
    xlab("Bin width (%)")+
    ylab("Mean proportions of individuals")#+

plot_irregularity_05_2024

Code
ggsave(plot_irregularity_05_2024, file= paste0(wdpath,"cons/_figs/irreg250921.jpg"), width=8.5, height=5.5, dpi=500)

The irregularity metrics indicate a meaningful departure from perfectly scheduled repeated measures: although the raw AUC is modest (≈0.225), its transformed value (≈86) points to substantial variability in visit timing. This supports modeling strategies that explicitly accommodate irregular observation times and potential informativeness of assessment schedules—conditionally on a rich set of covariates, including baseline factors, prior outcomes, and visit history. The figure displays mean proportions of subjects with 0, 1, and ≥2 visits per bin as the bin width ranges from 1% to 50% of the inter-admission interval. Notably, in the earliest bins (roughly the first five), multiple observations frequently fall within the same time bin.

Longitudinal formatting

We create dummy variables (binary 0/1 indicators) from categorical variables in the dataset sdat, for occupation status and for the initial and primary substances.

First, we centered the year variable around 2015 (year_c), making 2015 the baseline (value = 0), which improves coefficient interpretation and reduces multicollinearity. Second, we standardized age by centering it at the mean and scaling by 10 (age_c10), so each unit represents a 10-year difference from the average age—making effects easier to interpret per decade and improving numerical stability. Together, these transformations create meaningful reference points, aid in comparing effect sizes, and support more robust statistical modeling.

Code
# clean memory
gc()

# --- Prepare data: plain data.frame for IrregLong ---
sdat <- as.data.frame(SISTRAT_traj)

# Numeric subject id, numeric time, and a visit indicator (every row = visit)
sdat$id        <- as.numeric(factor(sdat$hash_key))
sdat$time      <- as.numeric(sdat$time)
sdat$event_vis <- 1L

# Occupation dummies (0/1) from occupation_condition_corr24
for (occ in c("inactive", "unemployed", "employed")) {
  sdat[[paste0("occ24_", occ)]] <- as.integer(sdat$occupation_condition_corr24 == occ)
}

# Suffix -> substance-level map shared by both substance variables
subs_map <- c(
  oh  = "alcohol",
  coc = "cocaine powder",
  pbc = "cocaine paste",
  mar = "marijuana",
  oth = "others"
)
# Dummies for the initial-diagnosis substance
for (sfx in names(subs_map)) {
  sdat[[paste0("susinidum_", sfx)]] <- as.integer(sdat$sus_ini_mod_mvv == subs_map[[sfx]])
}
# Dummies for the primary substance at admission
for (sfx in names(subs_map)) {
  sdat[[paste0("susprindum_", sfx)]] <- as.integer(sdat$primary_sub_mod == subs_map[[sfx]])
}

# Center year at 2015; center age at the mean and scale per 10 years
sdat$year_c  <- sdat$year_decimal - 2015
sdat$age_c10 <- (sdat$adm_age_rec3 - mean(sdat$adm_age_rec3, na.rm = TRUE)) / 10

#Ensure treat_log_days exists, is numeric, and finite
#keep NA for non-positive so the lagger can handle it
stopifnot(sdat$treat_log_days[!is.finite(sdat$treat_log_days)]|> length()==0)

sdat$treat_log_days[!is.finite(sdat$treat_log_days)] <- NA_real_
            used   (Mb) gc trigger   (Mb)  max used   (Mb)
Ncells   4365995  233.2    6436530  343.8   6436530  343.8
Vcells 187107649 1427.6  383117362 2923.0 383117301 2923.0

Intercept-only and general weights

Code
cat("Survival time\n")
summary(sdat$cens_time)
#  Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
# 6.742  90.161 124.871 121.133 157.355 304.871 
# Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
# 7.452  95.484 129.484 125.146 159.806 304.871

# Keep your treatment outcome separately (edit if you want a different outcome)
# Here, 1 = noncompletion/dropout; 0 = otherwise
sdat$tr_outcome <- as.integer(sdat$tr_noncompletion)

# --- Max follow-up per subject (use your cens_time in MONTHS) ---
# One row per id: the largest censoring time observed for that subject.
maxfu_df <- aggregate(cens_time ~ id, data = sdat, FUN = function(x) max(x, na.rm = TRUE))
# Column names "maxfu.id"/"maxfu.time" are required by iiw.weights(maxfu = ...)
names(maxfu_df) <- c("maxfu.id", "maxfu.time")


# --- Visit-intensity model with inverse-intensity weights ---
# The function will internally create time.lag and tr_outcome.lag because we list them in `lagvars`
# Minimal specification: visit intensity depends only on the lagged outcome;
# cluster(id) gives robust SEs for repeated visits within subject.
i0 <- IrregLong::iiw.weights(
  formulaph  = survival::Surv(time.lag, time, event_vis) ~ tr_outcome.lag + survival::cluster(id),
  # If you want stabilized weights, uncomment and provide a null model (often just cluster):
  # formulanull = ~ survival::cluster(id),
  data       = sdat,
  id         = "id",
  time       = "time",
  event      = "event_vis",          # each row is a visit
  lagvars    = c("time", "tr_outcome"),
  invariant  = "id",
  maxfu      = maxfu_df,             # per-ID maximum follow-up (same units as 'time': months)
  lagfirst   = c(0, 0),              # time.lag = 0 at first visit; tr_outcome.lag = 0 at first visit
  first      = TRUE                  # baseline visit observed with prob = 1
)

Warning in survival::Surv(time.lag, time, event_vis): Stop time must be > start time, NA created

Code
# Model summary for the visit-intensity fit
# (auto-prints the coxph object stored inside the iiw.weights() result)
i0$m
# coxph(formula = formulaph, data = datacox)
# 
#                             coef  exp(coef)   se(coef)       z      p
# tr_outcome.lag        -6.883e-01  5.025e-01  8.237e-03 -83.560 <2e-16
# survival::cluster(id)  3.000e-07  1.000e+00  4.848e-07   0.619  0.536
# 
# Likelihood ratio test=6858  on 2 df, p=< 2.2e-16
# n= 95355, number of events= 67912 
#    (45 observations deleted due to missingness)
# Rebuild the model frame keeping NAs, so the rows coxph dropped can be located
mf_all <- stats::model.frame(i0$m, data = i0$datacox, na.action = stats::na.pass)

Warning in survival::Surv(time.lag, time, event_vis): Stop time must be > start time, NA created

Code
# TRUE where the model-frame row has no missing values (i.e. coxph kept it)
keep_idx <- stats::complete.cases(mf_all)
# Row indices coxph dropped (zero-length intervals / missing lags).
# NOTE(review): drop_idx is not used below; kept for interactive inspection.
drop_idx <- which(!keep_idx)
# i0$datacox[which(!keep_idx), c("id","time","time.lag","event_vis","tr_outcome","tr_outcome.lag","cens_time")]

#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:

# Columns needed for the adjusted visit-intensity models below
model_vars <- c(
  "time", "event_vis",  # Survival components
  "tr_noncompletion", 
  "biopsych_comp_severe", 
  "treat_lt_90", 
  "treat_log_days", 
  "polysubstance_strict", 
  "age_c10", 
  "year_c", 
  "susinidum_oh", 
  "susinidum_coc", 
  "susinidum_pbc", 
  "susinidum_mar", 
  "dg_psiq_cie_10_instudy", 
  "dg_psiq_cie_10_dg", 
  "prim_sub_daily", 
  "occ24_inactive", 
  "occ24_unemployed", 
  "susprindum_oh", 
  "susprindum_coc", 
  "susprindum_pbc", 
  "susprindum_mar", 
  "id"
)#age_c10, year_c

# NOTE(review): missing trailing "\n" -- output runs into the next line
cat("Filtered without events with ties")
# Drop every subject that contributed at least one incomplete row above
sdat_filt<- subset(sdat, !id %in% i0$datacox[which(!keep_idx),"id"])
maxfu_df_filt <- subset(maxfu_df, !maxfu.id %in% i0$datacox[which(!keep_idx),"id"])

cat("Mean positive days for lag treat_log_days at 90 days\n")
# This mean (4.51086, printed below) is reused as the lagfirst baseline for
# treat_log_days in the adjusted fits
filter(sdat, dit_rec6==90)|> select(dit_rec6, treat_log_days)|> pull(treat_log_days)|> mean()

# Fully adjusted visit-intensity model. The ".lag" covariates in the formula
# are created internally by iiw.weights() for every variable in `lagvars`;
# lagfirst supplies the value each lag takes at a subject's first visit,
# in the SAME ORDER as `lagvars`.
i1_adj <- IrregLong::iiw.weights(
  formulaph  = survival::Surv(time.lag,time,event_vis)~ tr_noncompletion.lag+ biopsych_comp_severe.lag+ treat_lt_90.lag+ treat_log_days.lag+ polysubstance_strict.lag+ age_c10+ year_c+ susinidum_oh+ susinidum_coc+ susinidum_pbc+ susinidum_mar+ dg_psiq_cie_10_instudy+ dg_psiq_cie_10_dg+  prim_sub_daily+ occ24_inactive+ occ24_unemployed+ susprindum_oh+ susprindum_coc+ susprindum_pbc+ susprindum_mar+ survival::cluster(id),
  # If you want stabilized weights, uncomment and provide a null model (often just cluster):
  # formulanull = ~ survival::cluster(id),
  data       = sdat,#[,model_vars],
  id         = "id",
  time       = "time",
  event      = "event_vis",          # each row is a visit
  lagvars= c("time", "tr_noncompletion", "biopsych_comp_severe", "treat_lt_90", "treat_log_days", "polysubstance_strict"),
  invariant  = "id",
  maxfu      = maxfu_df,             # per-ID maximum follow-up (same units as 'time': months)
  lagfirst   = c(5.149111,1,1,1,4.51086,1),              # baselines, in lagvars order: time.lag = 5.149111 (months; TODO confirm derivation), binaries = 1, treat_log_days.lag = 4.51086 (mean log-days at 90 days, computed above)
  first      = TRUE                  # baseline visit observed with prob = 1
)

Warning in survival::Surv(time.lag, time, event_vis): Stop time must be > start time, NA created

Code
# Call iiw.weights w/0s
# Same adjusted model as i1_adj, but with baseline (lagfirst) values set to 0
# for the binary lags -- the "Primary (lag=0)" specification in the tables.
i1_adj_0 <- IrregLong::iiw.weights(
  formulaph  = survival::Surv(time.lag, time, event_vis) ~
    tr_noncompletion.lag + biopsych_comp_severe.lag + treat_lt_90.lag +
    treat_log_days.lag + polysubstance_strict.lag +
    age_c10+ year_c+ 
    susinidum_oh + susinidum_coc + susinidum_pbc + susinidum_mar +
    dg_psiq_cie_10_instudy + dg_psiq_cie_10_dg + prim_sub_daily +
    occ24_inactive + occ24_unemployed +
    susprindum_oh + susprindum_coc + susprindum_pbc + susprindum_mar +
    survival::cluster(id),
  data       = sdat,
  id         = "id",
  time       = "time",
  event      = "event_vis",
  # include *all* variables you want lagged (and in the same order for lagfirst)
  lagvars    = c("time", "tr_noncompletion", "biopsych_comp_severe",
                 "treat_lt_90", "treat_log_days", "polysubstance_strict"),
  invariant  = "id",
  maxfu      = maxfu_df,
  # use zeros to avoid integer/double coercion warnings
  # (2.772589 = mean log-days at 15 days, computed further below;
  #  5.149111/2: half the time baseline used in i1_adj -- TODO confirm)
  lagfirst   = c(5.149111/2, 0, 0, 0, 2.772589, 0),

  first      = TRUE
)

Warning in survival::Surv(time.lag, time, event_vis): Stop time must be > start time, NA created

Code
# Summarise each weight vector (min/quartiles/mean/max, NA count) as one row,
# then render the two rows as a markdown table.
wt_row <- function(label, w) {
  cbind.data.frame(model = label, t(matrix(summary(w))))
}
wt_tab <- rbind.data.frame(
  wt_row("Primary (after PH, lag=0)",     i1_adj_0$iiw.weight),
  wt_row("Alternative (after PH, lag=1)", i1_adj$iiw.weight)
)
# Column headers come from the summary() names (Min., 1st Qu., ..., NA's)
knitr::kable(
  wt_tab,
  digits    = 2,
  size      = 10,
  format    = "markdown",
  caption   = "Weights (no correction for PHs)",
  col.names = c("Weight", attr(summary(i1_adj$iiw.weight), "names"))
)
# Reference output from earlier runs:
# |Weight                        | Min.| 1st Qu.| Median| Mean| 3rd Qu.| Max.| NA's|
# |:-----------------------------|----:|-------:|------:|----:|-------:|----:|----:|
# |Primary (after PH, lag=0)     | 0.17|    1.00|   1.11| 1.58|    1.97| 9.03|  982|
# |Alternative (after PH, lag=1) | 0.09|    0.45|   0.86| 0.74|    1.00| 2.35|  982|
Survival time
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  7.452  95.484 129.484 125.146 159.806 304.871 
Call:
coxph(formula = formulaph, data = datacox)

                            coef  exp(coef)   se(coef)       z      p
tr_outcome.lag        -6.883e-01  5.025e-01  8.237e-03 -83.560 <2e-16
survival::cluster(id)  3.000e-07  1.000e+00  4.848e-07   0.619  0.536

Likelihood ratio test=6858  on 2 df, p=< 2.2e-16
n= 95355, number of events= 67912 
   (45 observations deleted due to missingness)
Filtered without events with tiesMean positive days for lag treat_log_days at 90 days
[1] 4.51086
Weights (no correction for PHs)
Weight Min. 1st Qu. Median Mean 3rd Qu. Max.
Primary (after PH, lag=0) 0.16 1.00 1.09 1.55 1.92 8.77
Alternative (after PH, lag=1) 0.08 0.47 0.93 0.76 1.00 2.56

We built patient trajectories in SISTRAT by ordering episodes within each hash_key, labeling sequential treatments, and defining each person’s entry date. Using lubridate::time_length() on interval(enter_date, disch_date) and interval(enter_date, 2025-05-28), we computed follow-up times in months (time, cens_time), then resolved ties within subjects by adding tiny offsets so repeated times don’t collide. We set the survival time (surv_time) to time for events and to cens_time otherwise (event proxied by tr_noncompletion). To examine visit irregularity, we used IrregLong’s diagnostics and summarized the proportions with 0, 1, and ≥2 visits across time bins. Finally, we fitted a visit-intensity (counting-process) model with IrregLong::iiw.weights()—treating each row as a visit, lagging key covariates internally, and supplying per-subject maximum follow-up—to obtain inverse-intensity weights for downstream outcome analyses that account for irregular and potentially informative visit times.

The visit‐intensity Cox model shows that the lagged treatment outcome is strongly associated with when the next visit is observed: conditional on the model, the instantaneous rate of being observed at a visit is about 50% lower following a prior non-completion event (HR = exp(−0.688) ≈ 0.50). This provides clear evidence of informative visit times, justifying the use of inverse-intensity weights. The overall likelihood ratio test is highly significant (χ² ≈ 6858 on 2 df), with 95,355 intervals and 67,912 visit events (45 rows dropped for missingness).

Weights by plan type

We fixed zero-length intervals in the dat database; those rows contribute no person-time. We then winsorized and renormalized the weight values: values below the 2.5th percentile were set to the 2.5th percentile, and values above the 97.5th percentile were set to the 97.5th percentile.

Code
# NOTE: the 45 rows coxph dropped are exactly the zero-length intervals:
# time == 0 and time.lag == 0 (baseline rows with event_vis == 1). In a
# counting-process Surv(start, stop, event), stop must be strictly greater
# than start, so those rows get dropped.
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
# Fix: precompute a baseline-safe lag for the covariate ourselves and don't ask
# IrregLong to re-lag it. Only let IrregLong lag `time`. This avoids baseline
# NAs and keeps the rows.

## Work on a data.table copy of sdat
## (a previous comment said "Start from i0$datacox", but the input is sdat)
dat <- data.table::as.data.table(sdat)

## Sort by subject and visit time
data.table::setorder(dat, id, time)

## Make intervals strictly positive & ordered: break exact ties within
## (id, time) by adding multiples of eps, then build time.lag and force
## time > time.lag row-wise.
eps <- 1e-3
dat[, .idx := seq_len(.N) - 1L, by = .(id, time)]
dat[, time := time + eps * .idx][, .idx := NULL]
dat[, time.lag := data.table::shift(time, type = "lag", fill = 0), by = .(id)]
## NOTE(review): single-pass fix -- a bumped `time` is not propagated into the
## next row's time.lag, so rare residual inversions may remain (hence the
## Surv() warning that still appears downstream). Verify if that matters here.
dat[time <= time.lag, time := time.lag + eps]

## Lagged covariate with baseline = 0 (no NAs on first row)
dat[, tr_outcome_lag0 := data.table::shift(tr_outcome, type = "lag"), by = .(id)]
dat[is.na(tr_outcome_lag0), tr_outcome_lag0 := 0L]

## Per-id max follow-up (id as character on both sides so types stay consistent;
## the original converted id twice -- the duplicate line was removed)
dat[, id := as.character(id)]
maxfu_df <- dat[, .(maxfu.time = max(cens_time, na.rm = TRUE)),
                by = .(maxfu.id = id)]

## Back to plain data.frames for IrregLong
data.table::setDF(dat); maxfu_df <- as.data.frame(maxfu_df)

## Refit: only let IrregLong lag time; use our tr_outcome_lag0 in the formula
i0_3 <- IrregLong::iiw.weights(
  formulaph  = survival::Surv(time.lag, time, event_vis) ~ tr_outcome_lag0 + survival::cluster(id),
  data       = dat,
  id         = "id",
  time       = "time",
  event      = "event_vis",
  lagvars    = c("time"),   # we already created tr_outcome_lag0
  invariant  = "id",
  maxfu      = maxfu_df,
  lagfirst   = c(0),        # first time.lag = 0
  first      = TRUE
)

Warning in survival::Surv(time.lag, time, event_vis): Stop time must be > start time, NA created

Code
## Sanity check: how many rows would coxph drop now?
## (model frame rebuilt with NAs kept so incomplete rows can be counted)
mf3 <- stats::model.frame(i0_3$m, data = i0_3$datacox, na.action = stats::na.pass)

Warning in survival::Surv(time.lag, time, event_vis): Stop time must be > start time, NA created

Code
# Count of incomplete rows remaining after the baseline-safe-lag fix
sum(!stats::complete.cases(mf3))  # should be ~0 (or just the true oddballs), not ~31928

# Weights should now contain no NAs
table(is.na(i0_3$iiw.weight))
invisible("didnt change, despite no more NAs in weights")

#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:

cat("Mean positive days for lag treat_log_days at 15 days\n")
filter(sdat, dit_rec6==15)|> select(dit_rec6, treat_log_days)|> pull(treat_log_days)|> mean()

cat("$:$:$:$:$:$:$:$:$:$:$:$:$:$:\n")
cat("Weights by plan type\n")
cat("$:$:$:$:$:$:$:$:$:$:$:$:$:$:\n")

plan_types <- unique(dat$plan_type_mod)

iiw_by_plan0 <- lapply(plan_types, function(pt) {
  cat("Running for:", pt, "\n")
  # Subset as plain data.frame
  dat_sub <- dat |> filter(plan_type_mod == pt) |> as.data.frame()
  # Subset maxfu_df to only the ids in dat_sub
  maxfu_sub <- maxfu_df[maxfu_df$maxfu.id %in% dat_sub$id, , drop = FALSE]
  IrregLong::iiw.weights(
    formulaph  = survival::Surv(time.lag, time, event_vis) ~ 
      tr_noncompletion.lag + biopsych_comp_severe.lag + 
      treat_lt_90.lag + treat_log_days.lag + polysubstance_strict.lag +
      age_c10 + year_c +
      susinidum_oh + susinidum_coc + susinidum_pbc + susinidum_mar +
      dg_psiq_cie_10_instudy + dg_psiq_cie_10_dg + prim_sub_daily +
      occ24_inactive + occ24_unemployed +
      susprindum_oh + susprindum_coc + susprindum_pbc + susprindum_mar +
      survival::cluster(id),
    data       = dat_sub,
    id         = "id",
    time       = "time",
    event      = "event_vis",
    lagvars    = c("time", "tr_noncompletion", "biopsych_comp_severe",
                   "treat_lt_90", "treat_log_days", "polysubstance_strict"),
    invariant  = "id",
    maxfu      = maxfu_sub,  # stratified by subset IDs
    lagfirst   = c(5.149111, 0, 0, 0, 2.772589, 0),
    first      = TRUE
  )
})

Warning in survival::Surv(time.lag, time, event_vis): Stop time must be > start time, NA created

Warning in survival::Surv(time.lag, time, event_vis): Stop time must be > start time, NA created

Warning in survival::Surv(time.lag, time, event_vis): Stop time must be > start time, NA created

Warning in survival::Surv(time.lag, time, event_vis): Stop time must be > start time, NA created

Warning in survival::Surv(time.lag, time, event_vis): Stop time must be > start time, NA created

Code
names(iiw_by_plan0) <- plan_types

# Same per-plan fits as iiw_by_plan0, but with the "Alternative" lagfirst
# baselines (binary lags = 1; treat_log_days baseline at 90 days = 4.51086)
iiw_by_plan1 <- lapply(plan_types, function(pt) {
  cat("Running for:", pt, "\n")
  # Subset as plain data.frame
  dat_sub <- dat |> filter(plan_type_mod == pt) |> as.data.frame()
  # Subset maxfu_df to only the ids in dat_sub
  maxfu_sub <- maxfu_df[maxfu_df$maxfu.id %in% dat_sub$id, , drop = FALSE]
  IrregLong::iiw.weights(
    formulaph  = survival::Surv(time.lag, time, event_vis) ~ 
      tr_noncompletion.lag + biopsych_comp_severe.lag + 
      treat_lt_90.lag + treat_log_days.lag + polysubstance_strict.lag +
      age_c10 + year_c +
      susinidum_oh + susinidum_coc + susinidum_pbc + susinidum_mar +
      dg_psiq_cie_10_instudy + dg_psiq_cie_10_dg + prim_sub_daily +
      occ24_inactive + occ24_unemployed +
      susprindum_oh + susprindum_coc + susprindum_pbc + susprindum_mar +
      survival::cluster(id),
    data       = dat_sub,
    id         = "id",
    time       = "time",
    event      = "event_vis",
    # lagfirst entries below match this order
    lagvars    = c("time", "tr_noncompletion", "biopsych_comp_severe",
                   "treat_lt_90", "treat_log_days", "polysubstance_strict"),
    invariant  = "id",
    maxfu      = maxfu_sub,  # stratified by subset IDs
    lagfirst   = c(5.149111, 1, 1, 1, 4.51086, 1),
    first      = TRUE
  )
})

Warning in survival::Surv(time.lag, time, event_vis): Stop time must be > start time, NA created

Warning in survival::Surv(time.lag, time, event_vis): Stop time must be > start time, NA created

Warning in survival::Surv(time.lag, time, event_vis): Stop time must be > start time, NA created

Warning in survival::Surv(time.lag, time, event_vis): Stop time must be > start time, NA created

Warning in survival::Surv(time.lag, time, event_vis): Stop time must be > start time, NA created

Code
names(iiw_by_plan1) <- plan_types


#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
cat("Extract weights\n")
extract_weights <- function(iiw_by_plan, dat, 
                            weight_name = "iiw.weight", 
                            out_col = "iiw_weight", 
                            na_to_one = TRUE) {
  weights_list <- lapply(names(iiw_by_plan), function(pt) {
    mod <- iiw_by_plan[[pt]]
    dat_sub <- dat[dat$plan_type_mod == pt, , drop = FALSE]
    # Check weight existence
    if (!weight_name %in% names(mod)) {
      stop(sprintf("Weight '%s' not found in model for plan type '%s'",
                   weight_name, pt))
    }
    w <- mod[[weight_name]]
    if (length(w) != nrow(dat_sub)) {
      stop(sprintf("Row mismatch for %s: %d weights vs %d rows",
                   pt, length(w), nrow(dat_sub)))
    }
    if (na_to_one) {
      w[is.na(w)] <- 1
    }
    dat_sub[[out_col]] <- w
    dat_sub
  })
  do.call(rbind, weights_list)
}

# Attach the lag=0 ("Primary") weights to the data
dat_with_weights_stab <- extract_weights(iiw_by_plan0, dat, 
                                         weight_name = "iiw.weight", 
                                         out_col = "iiw_weight0_stab")


# Attach the lag=1 ("Alternative") weights to a parallel copy
dat_with_weights_stab1 <- extract_weights(iiw_by_plan1, dat, 
                                         weight_name = "iiw.weight", 
                                         out_col = "iiw_weight1_stab")

# Merge both weight columns on "rn" (presumably a unique row key created
# earlier in the file -- verify), then restore subject/age ordering
dat_with_weights_stab_cons<- 
dat_with_weights_stab|> #67957
  left_join(dat_with_weights_stab1[,c("rn","iiw_weight1_stab")], by="rn")|>
  arrange(hash_key, adm_age_rec3)

#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:

cat("Add weight 1 in case of first treatment for time.lag==0\n")
table(subset(dat_with_weights_stab_cons, treatment==1)$time.lag)

dat_with_weights_stab_cons$iiw_weight0_stab <- ifelse(
  is.na(dat_with_weights_stab_cons$iiw_weight0_stab) & dat_with_weights_stab_cons$time.lag == 0,
  1,
  dat_with_weights_stab_cons$iiw_weight0_stab
)
dat_with_weights_stab_cons$iiw_weight1_stab <- ifelse(
  is.na(dat_with_weights_stab_cons$iiw_weight1_stab) & dat_with_weights_stab_cons$time.lag == 0,
  1,
  dat_with_weights_stab_cons$iiw_weight1_stab
)
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:

# Summary table of the two (baseline-filled) weight columns
rbind.data.frame(
cbind.data.frame(model= "Primary (after PH, lag=0)",
                 t(matrix(summary(dat_with_weights_stab_cons$iiw_weight0_stab ))) ),
cbind.data.frame(model= "Alternative (after PH, lag=1)",  t(matrix(summary(dat_with_weights_stab_cons$iiw_weight1_stab)))) )%>% 
  { 
    knitr::kable(., digits=2, size=10, format="markdown", caption="Weights (no correction for PHs)", col.names= c("Weight",attr(summary(dat_with_weights_stab_cons$iiw_weight1_stab),"names")) ) 
  }

# cap()/renorm()/summ_w() are helpers defined earlier in the file:
# cap winsorizes at the given quantiles, renorm rescales, summ_w summarizes.
# NOTE(review): w_primary_tr / w_alt_tr are not used in this section -- check
# whether they are needed later, otherwise they are dead assignments.
w_primary_tr <- cap(dat_with_weights_stab_cons$iiw_weight0_stab, c(0.025, 0.975))
w_alt_tr     <- cap(dat_with_weights_stab_cons$iiw_weight1_stab, c(0.025, 0.975))


# Winsorize at the 2.5th/97.5th percentiles, then renormalize
w0      <- dat_with_weights_stab_cons$iiw_weight0_stab
w0_tr   <- renorm(cap(w0, c(0.025, 0.975)))
w1      <- dat_with_weights_stab_cons$iiw_weight1_stab
w1_tr   <- renorm(cap(w1, c(0.025, 0.975)))

cbind.data.frame(type= c("Lag=0, raw","lag=0 trunc 2.5–97.5%","lag=1 raw","lag=1 trunc 2.5–97.5%"),
dplyr::bind_rows(
  summ_w(w0,    "lag=0 raw"),
  summ_w(w0_tr, "lag=0 trunc 2.5–97.5%"),
  summ_w(w1,    "lag=1 raw"),
  summ_w(w1_tr, "lag=1 trunc 2.5–97.5%")
))|> knitr::kable("markdown", digits=2, caption= "Weights, stabilized and truncated")
# |type                  | n_non_na| n_na|  min|  q01|  q25| median| mean|  q75|  q99|  max| prop_lt_0.2| prop_gt_5|      ESS|
# |:---------------------|--------:|----:|----:|----:|----:|------:|----:|----:|----:|----:|-----------:|---------:|--------:|
# |Lag=0, raw            |    67957|    0| 0.19| 0.36| 1.00|   1.00| 1.06| 1.00| 2.67| 7.05|           0|         0| 59450.19|
# |lag=0 trunc 2.5–97.5% |    67957|    0| 0.42| 0.42| 0.95|   0.95| 1.00| 0.95| 2.11| 2.11|           0|         0| 61803.67|
# |lag=1 raw             |    67957|    0| 0.15| 0.35| 1.00|   1.00| 1.07| 1.00| 2.82| 8.25|           0|         0| 58250.96|
# |lag=1 trunc 2.5–97.5% |    67957|    0| 0.41| 0.41| 0.95|   0.95| 1.00| 0.95| 2.14| 2.14|           0|         0| 61572.35|
# INTEGRATE!
# Final truncated weights used by the outcome models below
dat_with_weights_stab_cons$w0_tr <-  w0_tr
dat_with_weights_stab_cons$w1_tr <-  w1_tr
[1] 27444

FALSE 
67957 
Mean positive days for lag treat_log_days at 15 days
[1] 2.772589
$:$:$:$:$:$:$:$:$:$:$:$:$:$:
Weights by plan type
$:$:$:$:$:$:$:$:$:$:$:$:$:$:
Running for: GP basic ambulatory 
Running for: GP residential 
Running for: WO residential 
Running for: GP intensive ambulatory 
Running for: WO intensive ambulatory 
Running for: GP basic ambulatory 
Running for: GP residential 
Running for: WO residential 
Running for: GP intensive ambulatory 
Running for: WO intensive ambulatory 
Extract weights
Add weight 1 in case of first treatment for time.lag==0

    0 
27445 
Weights (no correction for PHs)
Weight Min. 1st Qu. Median Mean 3rd Qu. Max.
Primary (after PH, lag=0) 0.15 1.00 1.02 1.46 1.78 7.35
Alternative (after PH, lag=1) 0.08 0.51 0.93 0.78 1.00 3.12
Weights, stabilized and truncated
type n_non_na n_na min q01 q25 median mean q75 q99 max prop_lt_0.2 prop_gt_5 ESS
Lag=0, raw 67957 0 0.15 0.59 1.00 1.02 1.46 1.78 3.93 7.35 0.00 0 54514.18
lag=0 trunc 2.5–97.5% 67957 0 0.50 0.50 0.69 0.71 1.00 1.23 2.33 2.33 0.00 0 56078.17
lag=1 raw 67957 0 0.08 0.19 0.51 0.93 0.78 1.00 1.46 3.12 0.01 0 59071.99
lag=1 trunc 2.5–97.5% 67957 0 0.29 0.29 0.66 1.21 1.00 1.29 1.64 1.64 0.00 0 59698.29
Intermediate saving

Save the current R environment at that intermediate step.

Code
#save.image(file = file.path(wdpath, "data/20241015_out/psu", "intermediate.RData"))
Association between non-completion and PSU, general
Code
gc()
# Output root: append "/cons" to getwd(), then strip every "cons" substring,
# leaving the working directory without any "cons" segment (plus a trailing
# slash). NOTE(review): convoluted -- verify it yields the intended directory
# on all machines.
wdpath<-
paste0(gsub("/cons","",gsub("cons","",paste0(getwd(),"/cons"))))

#load(file = file.path(wdpath, "data/20241015_out/psu", "intermediate.RData"))

## Fit the non-completion ~ PSU Poisson models (unweighted, w0-weighted,
## w1-weighted) with cluster-robust SEs, save the exponentiated estimates to
## CSV, print a preview, and return the combined summary table.
##
## The original code ran this identical pipeline twice -- once inside
## job::job() when interactive, once inline otherwise. It is factored out here
## so there is a single source of truth; behavior of each branch is unchanged.
run_psu_models <- function(dat_with_weights_stab_cons, wdpath) {
  gc()
  suppressPackageStartupMessages({
    library(dplyr)
    library(sandwich)
  })
  ## -------------------- MODELO --------------------
  ff_formula <- tr_noncompletion ~ polysubstance_strict + age_c10 + year_c +
    susinidum_oh + susinidum_coc + susinidum_pbc + susinidum_mar +
    dg_psiq_cie_10_instudy + dg_psiq_cie_10_dg +
    prim_sub_daily + occ24_inactive + occ24_unemployed +
    susprindum_coc + susprindum_pbc + susprindum_mar

  vars_ff    <- all.vars(ff_formula)
  vars_extra <- c("id", "w0_tr", "w1_tr")
  vars_need  <- unique(c(vars_ff, vars_extra))

  # Plain data.frame (no data.table semantics), only the columns we need
  base <- as.data.frame(dat_with_weights_stab_cons)[, vars_need, drop = FALSE]

  # Model frame without NA handling, used only to build the complete-case mask
  mf_all <- stats::model.frame(ff_formula, data = base, na.action = NULL)
  keep   <- stats::complete.cases(mf_all)
  if (!any(keep)) stop("No hay casos completos para la fórmula del modelo.")

  mf <- mf_all[keep, , drop = FALSE]                    # datos alineados con la fórmula
  id_vec <- base$id[keep]
  if (anyNA(id_vec)) stop("La variable 'id' contiene NA tras el filtrado.")
  id_int <- as.integer(factor(id_vec, exclude = NULL))  # clúster compacto y estable

  ## Clean weights (positive, finite, mean=1), aligned to 'keep'
  clean_weights <- function(w) {
    w[!is.finite(w) | is.na(w) | w <= 0] <- 1
    w / mean(w, na.rm = TRUE)
  }
  w0 <- clean_weights(base$w0_tr[keep])
  w1 <- clean_weights(base$w1_tr[keep])

  stopifnot(is.numeric(w0), is.numeric(w1),
            length(w0) == nrow(mf), length(w1) == nrow(mf))

  ncl <- length(unique(id_int))

  ## -------------------- ROBUST --------------------
  ## Poisson GLM; cluster-robust vcov when >= 2 clusters, HC0 otherwise.
  ## Weights are embedded into the data frame to avoid glm() scoping bugs.
  fit_glm_robust <- function(weights_vec = NULL) {
    df <- mf
    if (is.null(weights_vec)) {
      m <- stats::glm(ff_formula, data = df, family = stats::poisson())
    } else {
      df$.__w <- weights_vec                     # <- embed weights here
      m <- stats::glm(ff_formula, data = df, family = stats::poisson(), weights = .__w)
    }
    vc <- if (ncl >= 2) sandwich::vcovCL(m, cluster = id_int) else sandwich::vcovHC(m, type = "HC0")
    list(model = m, vcov = vc)
  }

  m_un <- fit_glm_robust(NULL)
  m_w0 <- fit_glm_robust(w0)
  m_w1 <- fit_glm_robust(w1)

  ## -------------------- RESULTS --------------------
  ## Exponentiated coefficients (rate ratios), Wald 95% CIs, robust p-values
  extract_results <- function(obj, name) {
    co <- stats::coef(obj$model)
    se <- sqrt(diag(obj$vcov))
    z  <- stats::qnorm(0.975)
    out <- data.frame(
      term    = names(co),
      est     = exp(co),
      cilo    = exp(co - z * se),
      ciup    = exp(co + z * se),
      p.value = 2 * stats::pnorm(abs(co / se), lower.tail = FALSE),
      check.names = FALSE
    )
    names(out)[-1] <- paste0(names(out)[-1], "_", name)
    out
  }

  est_summary <- Reduce(
    function(x, y) dplyr::full_join(x, y, by = "term"),
    list(
      extract_results(m_un, "un"),
      extract_results(m_w0, "w0"),
      extract_results(m_w1, "w1")
    )
  )

  utils::write.csv(est_summary,  file.path(wdpath, "data/20241015_out/psu", "simple_results.csv"), row.names = FALSE)
  cat("[ok] Saved in simple_results.csv (", nrow(est_summary), " rows)\n")
  print(utils::head(est_summary), row.names = FALSE)
  est_summary
}

if (interactive()) {
  # Background session; job::job() imports the calling environment, so
  # run_psu_models() and its inputs are available inside the job.
  # NOTE(review): as in the original, est_summary lives in the job session
  # until the job finishes and exports its environment back.
  job::job({
    est_summary <- run_psu_models(dat_with_weights_stab_cons, wdpath)
  })
} else {
  est_summary <- run_psu_models(dat_with_weights_stab_cons, wdpath)
}
gc()

# Rename to short column labels and display only the polysubstance rows
est_summary |>
  dplyr::rename(
    var   = term,
    lo_un = cilo_un,
    hi_un = ciup_un,
    p_un  = p.value_un,
    lo_w0 = cilo_w0,
    hi_w0 = ciup_w0,
    p_w0  = p.value_w0,
    lo_w1 = cilo_w1,
    hi_w1 = ciup_w1,
    p_w1  = p.value_w1
  )|> 
  dplyr::filter(grepl("poly",var))|> 
  knitr::kable("markdown", caption="Summary of associations by weights, Generalized model", digits=2)
            used   (Mb) gc trigger   (Mb)  max used   (Mb)
Ncells   4505288  240.7    7763836  414.7   7763836  414.7
Vcells 339456485 2589.9  551865000 4210.4 459820308 3508.2
[ok] Saved in simple_results.csv ( 16  rows)
                 term    est_un   cilo_un   ciup_un   p.value_un    est_w0
          (Intercept) 0.5633872 0.5181064 0.6126254 4.479191e-41 0.5442844
 polysubstance_strict 1.0430549 1.0309425 1.0553096 1.511658e-12 1.0511400
              age_c10 0.8997996 0.8891297 0.9105975 2.055947e-67 0.8997624
               year_c 0.9962844 0.9951192 0.9974509 4.517183e-10 0.9961477
         susinidum_oh 1.0276423 0.9563340 1.1042677 4.574001e-01 1.0505182
        susinidum_coc 1.0572570 0.9824007 1.1378172 1.372652e-01 1.0787129
   cilo_w0   ciup_w0   p.value_w0    est_w1   cilo_w1   ciup_w1   p.value_w1
 0.4963808 0.5968110 2.649867e-38 0.5608757 0.5141676 0.6118269 7.775737e-39
 1.0376779 1.0647768 3.356649e-14 1.0457489 1.0327894 1.0588710 2.051321e-12
 0.8881618 0.9115144 2.706333e-57 0.8968725 0.8856240 0.9082638 4.332718e-64
 0.9948846 0.9974125 2.491781e-09 0.9960187 0.9947877 0.9972513 2.578989e-10
 0.9703832 1.1372708 2.234709e-01 1.0225419 0.9495085 1.1011927 5.554597e-01
 0.9948215 1.1696787 6.661339e-02 1.0500175 0.9734672 1.1325874 2.063375e-01
            used   (Mb) gc trigger   (Mb)  max used   (Mb)
Ncells   4531972  242.1    7763836  414.7   7763836  414.7
Vcells 347809188 2653.6  551865000 4210.4 513139241 3915.0
Summary of associations by weights, Generalized model
var est_un lo_un hi_un p_un est_w0 lo_w0 hi_w0 p_w0 est_w1 lo_w1 hi_w1 p_w1
polysubstance_strict 1.04 1.03 1.06 0 1.05 1.04 1.06 0 1.05 1.03 1.06 0

Descriptives, again

Code
cat("Survival time\n")
summary(dat$cens_time)
# Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
# 6.7    90.2   124.9   121.1   157.4   304.9 
#  Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
# 7.452  95.484 129.484 125.146 159.806 304.871     

cat("Discard patients with only 1 treatment\n")

# Pick the strata variable automatically: prefer the strict PSU flag,
# fall back to the broader flag when the strict one is absent.
strata_var <- if ("polysubstance_strict" %in% names(dat_with_weights_stab_cons)) {
  "polysubstance_strict"
} else {
  "polysubstance"
}


# Variables of interest (their order drives TableOne's initial row order).
# Fix: the two "_oth" entries were transposed between the initial-diagnosis
# (susinidum_*) and at-admission (susprindum_*) groups; the selected SET is
# unchanged, only the grouping is now coherent with the comments.
vars_interest2 <- c(
  "tr_noncompletion",
  "biopsych_comp_severe",          # Biopsychosocial compromise
  "treat_lt_90",
  "treat_log_days",
  "adm_age_rec3",                  # Age at Admission (First Entry)
  "year_decimal",                  # Birth year (assumed numeric or year-like)
  "susinidum_oh",                  # Primary substance of the initial diagnosis
  "susinidum_coc",
  "susinidum_pbc",
  "susinidum_mar",
  "susinidum_oth",                 # fixed: was "susprindum_oth"
  "dg_psiq_cie_10_dg",             # Psychiatric comorbidity (ICD-10)
  "dg_psiq_cie_10_instudy",        # Psychiatric comorbidity (ICD-10) [In study]
  #"cie10_broad_cat",               # Broad diagnostic categories
  #"prim_sub_freq",                 # Frequency of primary substance use at admission
  "prim_sub_daily",                # Daily (0/1 you created)
  "occ24_inactive",   # Occupational status (corrected)
  "occ24_unemployed",
  "occ24_employed",
  "primary_sub_mod",                    # Primary substance at admission to treatment
  "susprindum_oh",
  "susprindum_coc",
  "susprindum_pbc",
  "susprindum_mar",
  "susprindum_oth"                 # fixed: was "susinidum_oth"
  )

# Which of those should be treated as categorical in TableOne.
# Fix: "susinidum_oth" appeared twice while "susprindum_oth" was missing,
# so TableOne summarized susprindum_oth as continuous — the rendered table
# showed "Primary substance at admission to treatment, others (mean (SD))"
# instead of counts/%.
factorVars_interest2 <- c(
  "biopsych_comp_severe",
  "susinidum_oh", 
  "susinidum_coc",
  "susinidum_pbc",
  "susinidum_mar",
  "susinidum_oth",
  "dg_psiq_cie_10_dg",
  "dg_psiq_cie_10_instudy",
  #"cie10_broad_cat",
  #"prim_sub_freq",
  "prim_sub_daily",               # treat 0/1 as categorical to show counts/%
  "occ24_inactive",
  "occ24_unemployed",
  "occ24_employed",
  "susprindum_oh",
  "susprindum_coc",
  "susprindum_pbc",
  "susprindum_mar",
  "susprindum_oth",               # fixed: was a duplicate "susinidum_oth"
  "tr_noncompletion",
  "treat_lt_90"
)

# 2) Labels (picked up by packages like table1/gtsummary; TableOne itself
#    does not read label attributes). One named vector + loop instead of
#    22 individual attr() assignments.
var_labels <- c(
  tr_noncompletion       = "Non-completion status of treatment (Dropout / Misspelled)",
  biopsych_comp_severe   = "Biopsychosocial compromise (Severe)",
  treat_lt_90            = "Treatment duration (binary) (<90 days)",
  treat_log_days         = "Treatment duration (log-scaled days)",
  adm_age_rec3           = "Age at admission to treatment",
  year_decimal           = "Birth year",
  # Primary substance (initial diagnosis) - label the variable; levels print as rows
  susinidum_oh           = "Primary substance (initial diagnosis), alcohol",
  susinidum_coc          = "Primary substance (initial diagnosis), cocaine hydrochloride",
  susinidum_pbc          = "Primary substance (initial diagnosis), cocaine base paste",
  susinidum_mar          = "Primary substance (initial diagnosis), marijuana",
  susinidum_oth          = "Primary substance (initial diagnosis), others",
  # Psychiatric comorbidity
  dg_psiq_cie_10_instudy = "Psychiatric comorbidity (ICD-10), In study",
  dg_psiq_cie_10_dg      = "Psychiatric comorbidity (ICD-10), Diagnosis",
  # Frequency / Daily
  prim_sub_daily         = "Daily frequency of primary substance used at admission",
  # Occupational status
  occ24_inactive         = "Occupational Status, inactive",
  occ24_unemployed       = "Occupational Status, unemployed",
  occ24_employed         = "Occupational Status, employed",
  # Primary substance at admission - label the variable; levels print as rows
  susprindum_oh          = "Primary substance at admission to treatment, alcohol",
  susprindum_coc         = "Primary substance at admission to treatment, cocaine hydrochloride",
  susprindum_pbc         = "Primary substance at admission to treatment, cocaine base paste",
  susprindum_mar         = "Primary substance at admission to treatment, marijuana",
  susprindum_oth         = "Primary substance at admission to treatment, others"
)
for (v in names(var_labels)) {
  attr(dat_with_weights_stab_cons[[v]], "label") <- var_labels[[v]]
}


# 0) Clean the variable lists BEFORE any use (previously the de-duplication
#    ran only after dat_tbl had already been built from the raw vectors).
vars_interest2       <- unique(vars_interest2)
factorVars_interest2 <- unique(factorVars_interest2)

# Build the analysis data (plain data.frame): one row per patient,
# restricted to the variables of interest plus the stratifier.
dat_tbl <- dat_with_weights_stab_cons |>
  tidytable::slice_head(n = 1, .by = hash_key) |>
  tidytable::select(tidytable::all_of(c(vars_interest2, strata_var))) |>
  as.data.frame()

# Keep only factor candidates that actually exist in the data
factorVars_interest2 <- intersect(factorVars_interest2, names(dat_tbl))

# (optional) warn if anything got dropped
# setdiff(factorVars_interest2_original, names(dat_tbl))

# 2) Create TableOne with clean inputs, stratified by the PSU flag,
#    with an Overall column, NA rows, SMDs and hypothesis tests.
tbone_interest2 <- tableone::CreateTableOne(
  vars       = vars_interest2,
  data       = dat_tbl,
  factorVars = factorVars_interest2,
  strata     = strata_var,
  addOverall = TRUE,
  includeNA  = TRUE,
  smd        = TRUE,
  test       = TRUE
)

# Flatten to a data.frame; skewed continuous variables reported as
# median [IQR] rather than mean (SD).
df_tbone2 <- as.data.frame.TableOne(
  tbone_interest2,
  nonnormal = c("adm_age_rec3", "year_decimal", "treat_log_days"),
  smd = TRUE
)

# Build desired order as a tibble with both characteristic and level.
# Fix: the cocaine rows used "cocaine powder"/"cocaine paste", but the label
# attributes set above are "cocaine hydrochloride"/"cocaine base paste", so
# the (characteristic, level) join never matched and those rows fell to the
# bottom of the rendered table.
order_df <- tibble::tribble(
  ~characteristic, ~level,
  "n", "",
  "Non-completion status of treatment (Dropout / Misspelled) (%)", "0",
  "Non-completion status of treatment (Dropout / Misspelled) (%)", "1",
  "Biopsychosocial compromise (Severe) (%)", "0",
  "Biopsychosocial compromise (Severe) (%)", "1",
  "Treatment duration (binary) (<90 days) (%)", "0",
  "Treatment duration (binary) (<90 days) (%)", "1",
  "Treatment duration (log-scaled days) (median [IQR])", "",
  "Age at admission to treatment (median [IQR])", "",
  "Birth year (median [IQR])", "",
  "Primary substance (initial diagnosis), cocaine hydrochloride (%)", "0",
  "Primary substance (initial diagnosis), cocaine hydrochloride (%)", "1",  
  "Primary substance (initial diagnosis), cocaine base paste (%)", "0",
  "Primary substance (initial diagnosis), cocaine base paste (%)", "1",  
  "Primary substance (initial diagnosis), marijuana (%)", "0",
  "Primary substance (initial diagnosis), marijuana (%)", "1",  
  "Primary substance (initial diagnosis), alcohol (%)", "0",
  "Primary substance (initial diagnosis), alcohol (%)", "1",  
  "Primary substance (initial diagnosis), others (%)", "0",
  "Primary substance (initial diagnosis), others (%)", "1",
  "Psychiatric comorbidity (ICD-10), In study (%)", "TRUE",
  "Psychiatric comorbidity (ICD-10), In study (%)", "FALSE",
  "Psychiatric comorbidity (ICD-10), Diagnosis (%)", "TRUE",
  "Psychiatric comorbidity (ICD-10), Diagnosis (%)", "FALSE",
  "Daily frequency of primary substance used at admission (%)", "0",
  "Daily frequency of primary substance used at admission (%)", "1",
  "Occupational Status, unemployed (%)", "0",
  "Occupational Status, unemployed (%)", "1",
  "Occupational Status, inactive (%)", "0",
  "Occupational Status, inactive (%)", "1",
  "Occupational Status, employed (%)", "0",
  "Occupational Status, employed (%)", "1",  
  "Primary substance at admission to treatment, cocaine hydrochloride (%)", "0",
  "Primary substance at admission to treatment, cocaine hydrochloride (%)", "1",
  "Primary substance at admission to treatment, cocaine base paste (%)", "0",
  "Primary substance at admission to treatment, cocaine base paste (%)", "1",
  "Primary substance at admission to treatment, marijuana (%)", "0",
  "Primary substance at admission to treatment, marijuana (%)", "1",
  "Primary substance at admission to treatment, alcohol (%)", "0",
  "Primary substance at admission to treatment, alcohol (%)", "1",
  "Primary substance at admission to treatment, others (%)", "0",
  "Primary substance at admission to treatment, others (%)", "1"
)

# Attach the desired position to every (characteristic, level) pair, sort by
# it, then drop the helper column. Pairs absent from order_df get NA and
# are pushed to the end by arrange().
order_lookup <- order_df %>% mutate(order_id = dplyr::row_number())

df_tbone_ordered2 <- df_tbone2 %>%
  left_join(order_lookup, by = c("characteristic", "level")) %>%
  arrange(order_id) %>%
  select(-order_id)

# Collapse the padding tableone leaves after "(" in the count columns.
strip_paren_pad <- function(x) gsub("\\(\\s+", "(", x)

df_tbone_ordered2 |> 
  #filter(level!=0, level!=FALSE)|>
  # Rename the primary_sub_mod level "cocaine powder" to "cocaine hydrochloride"
  mutate(level = gsub("powder", "hydrochloride", level)) |>
  mutate(
    Overall = strip_paren_pad(Overall),
    `0`     = strip_paren_pad(`0`),
    `1`     = strip_paren_pad(`1`)
  ) |>
  select(characteristic, level, `0`, `1`, Overall, SMD) |>
  tibble::as_tibble() |>
  # Bold the first row (the "n" header row) across every column
  (\(df) format_cells(df, 1, seq_along(names(df)), "bold"))() |>
  knitr::kable("markdown")
Survival time
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  7.452  95.484 129.484 125.146 159.806 304.871 
Discard patients with only 1 treatment
characteristic level 0 1 Overall SMD
n 5956 21489 27445
Non-completion status of treatment (Dropout / Misspelled) (%) 0 1633 (27.4) 4580 (21.3) 6213 (22.6) 0.143
Non-completion status of treatment (Dropout / Misspelled) (%) 1 4323 (72.6) 16909 (78.7) 21232 (77.4)
Biopsychosocial compromise (Severe) (%) 0 4417 (74.2) 13094 (60.9) 17511 (63.8) 0.285
Biopsychosocial compromise (Severe) (%) 1 1539 (25.8) 8395 (39.1) 9934 (36.2)
Treatment duration (binary) (<90 days) (%) 0 4760 (79.9) 16160 (75.2) 20920 (76.2) 0.113
Treatment duration (binary) (<90 days) (%) 1 1196 (20.1) 5329 (24.8) 6525 (23.8)
Treatment duration (log-scaled days) (median [IQR]) 5.25 [4.65, 5.80] 5.13 [4.51, 5.70] 5.16 [4.53, 5.72] 0.146
Age at admission to treatment (median [IQR]) 38.56 [30.44, 47.58] 31.66 [26.12, 38.65] 32.82 [26.76, 40.69] 0.625
Birth year (median [IQR]) 1977.48 [1968.61, 1985.57] 1982.91 [1975.73, 1988.85] 1982.07 [1974.10, 1988.34] 0.492
Primary substance (initial diagnosis), marijuana (%) 0 5413 (90.9) 17635 (82.1) 23048 (84.0) 0.260
Primary substance (initial diagnosis), marijuana (%) 1 543 (9.1) 3854 (17.9) 4397 (16.0)
Primary substance (initial diagnosis), alcohol (%) 0 1876 (31.5) 7934 (36.9) 9810 (35.7) 0.115
Primary substance (initial diagnosis), alcohol (%) 1 4080 (68.5) 13555 (63.1) 17635 (64.3)
Primary substance (initial diagnosis), others (%) 0 5912 (99.3) 21408 (99.6) 27320 (99.5) 0.049
Primary substance (initial diagnosis), others (%) 1 44 (0.7) 81 (0.4) 125 (0.5)
Psychiatric comorbidity (ICD-10), In study (%) TRUE 862 (14.5) 3903 (18.2) 4765 (17.4)
Psychiatric comorbidity (ICD-10), In study (%) FALSE 5094 (85.5) 17586 (81.8) 22680 (82.6) 0.100
Psychiatric comorbidity (ICD-10), Diagnosis (%) TRUE 2638 (44.3) 10148 (47.2) 12786 (46.6)
Psychiatric comorbidity (ICD-10), Diagnosis (%) FALSE 3318 (55.7) 11341 (52.8) 14659 (53.4) 0.059
Daily frequency of primary substance used at admission (%) 0 3384 (56.8) 11078 (51.6) 14462 (52.7) 0.106
Daily frequency of primary substance used at admission (%) 1 2572 (43.2) 10411 (48.4) 12983 (47.3)
Occupational Status, unemployed (%) 0 4198 (70.5) 13226 (61.5) 17424 (63.5) 0.190
Occupational Status, unemployed (%) 1 1758 (29.5) 8263 (38.5) 10021 (36.5)
Occupational Status, inactive (%) 0 4716 (79.2) 17275 (80.4) 21991 (80.1) 0.030
Occupational Status, inactive (%) 1 1240 (20.8) 4214 (19.6) 5454 (19.9)
Occupational Status, employed (%) 0 2998 (50.3) 12477 (58.1) 15475 (56.4) 0.156
Occupational Status, employed (%) 1 2958 (49.7) 9012 (41.9) 11970 (43.6)
Primary substance at admission to treatment, marijuana (%) 0 5767 (96.8) 20106 (93.6) 25873 (94.3) 0.153
Primary substance at admission to treatment, marijuana (%) 1 189 (3.2) 1383 (6.4) 1572 (5.7)
Primary substance at admission to treatment, alcohol (%) 0 2818 (47.3) 17054 (79.4) 19872 (72.4) 0.705
Primary substance at admission to treatment, alcohol (%) 1 3138 (52.7) 4435 (20.6) 7573 (27.6)
Primary substance (initial diagnosis), cocaine hydrochloride (%) 0 5463 (91.7) 19805 (92.2) 25268 (92.1) 0.016
Primary substance (initial diagnosis), cocaine hydrochloride (%) 1 493 (8.3) 1684 (7.8) 2177 (7.9)
Primary substance (initial diagnosis), cocaine base paste (%) 0 5160 (86.6) 19174 (89.2) 24334 (88.7) 0.080
Primary substance (initial diagnosis), cocaine base paste (%) 1 796 (13.4) 2315 (10.8) 3111 (11.3)
Primary substance at admission to treatment, others (mean (SD)) 0.02 (0.13) 0.02 (0.13) 0.02 (0.13) 0.006
primary_sub_mod (%) cocaine paste 1720 (28.9) 10507 (48.9) 12227 (44.6) 0.710
primary_sub_mod (%) cocaine hydrochloride 810 (13.6) 4789 (22.3) 5599 (20.4)
primary_sub_mod (%) alcohol 3138 (52.7) 4435 (20.6) 7573 (27.6)
primary_sub_mod (%) marijuana 189 (3.2) 1383 (6.4) 1572 (5.7)
primary_sub_mod (%) others 99 (1.7) 375 (1.7) 474 (1.7)
Primary substance at admission to treatment, cocaine hydrochloride (%) 0 5146 (86.4) 16700 (77.7) 21846 (79.6) 0.228
Primary substance at admission to treatment, cocaine hydrochloride (%) 1 810 (13.6) 4789 (22.3) 5599 (20.4)
Primary substance at admission to treatment, cocaine base paste (%) 0 4236 (71.1) 10982 (51.1) 15218 (55.4) 0.420
Primary substance at admission to treatment, cocaine base paste (%) 1 1720 (28.9) 10507 (48.9) 12227 (44.6)
Code
# 0) received an average of 2.3 SUD treatments 
summary(dat_with_weights_stab_cons$max_treatment)
#  Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
# 2.000   2.000   2.000   2.796   3.000  13.000 

# 0) Prevalence — first episode per patient after ordering by admission age
first_by_age <- tidytable::slice_head(
  arrange(dat_with_weights_stab_cons, hash_key, adm_age_rec3),
  n = 1, .by = hash_key
)
janitor::tabyl(first_by_age, polysubstance_strict)
# polysubstance_strict     n   percent
#                   0  6543 0.2384041
#                   1 20902 0.7615959

# First row per patient in current row order, computed ONCE and reused
# (the original re-ran slice_head() four times across the two cross-tabs).
first_row <- tidytable::slice_head(dat_with_weights_stab_cons, n = 1, .by = hash_key)

# 1) Cross-tab: non-completion × setting
tab1 <- table(first_row$plan_type_mod, first_row$tr_noncompletion)

# 2) Cross-tab: PSU × setting (assuming PSU is in sex_rec or another column)
# NOTE(review): this crosses plan type with sex_rec (levels hombre/mujer per
# the printed output), so the "PSU" description looks mislabeled — confirm
# the intended variable.
tab2 <- table(first_row$plan_type_mod, first_row$sex_rec)

# Format a chisq.test() result as "χ²(df) = stat, P ..." for table captions.
#
# @param test An `htest` object returned by chisq.test().
# @return A glue string, e.g. "χ²(4) = 400.46, P <.001".
format_chisq <- function(test){
  stat <- round(unname(test$statistic), 2)
  df   <- unname(test$parameter)
  p    <- test$p.value
  
  # P-value formatting. Scalar condition, so use if/else rather than
  # ifelse() (which is intended for vectors and needlessly evaluates
  # both branches).
  p_str <- if (p < .001) "<.001" else sprintf("=%.3f", p)
  
  glue::glue("χ²({df}) = {stat}, P {p_str}")
}

# Standardized-residual table for a χ² test: keeps cells whose absolute
# residual is at least `cutoff` and formats residuals / p-values for display.
#
# @param resid_mat Matrix of standardized residuals (chisq.test()$residuals).
# @param cutoff Minimum |residual| for a cell to be reported (default 2,
#   roughly the 5% two-sided threshold under N(0, 1)).
# @return data.frame with columns Var1, Var2, Residual ("%.2f" string) and
#   signif (two-sided p rounded to 3 decimals, printed with 2).
format_residuals <- function(resid_mat, cutoff = 2) {
  # Two-sided p-values assuming residuals ~ N(0, 1)
  pvals <- 2 * (1 - pnorm(abs(resid_mat)))
  
  long <- as.data.frame(as.table(resid_mat))
  long$pval <- as.vector(pvals)  # as.table() and as.vector() share column-major order
  
  # Fix: filter on the actual residual, not its rounded string form.
  # The original parsed the "%.2f" text back to numeric, so e.g. a residual
  # of 1.996 ("2.00") passed the cutoff while 1.994 ("1.99") did not.
  keep <- abs(long$Freq) >= cutoff
  
  data.frame(
    Var1     = long$Var1[keep],
    Var2     = long$Var2[keep],
    Residual = sprintf("%.2f", long$Freq[keep]),
    signif   = sprintf("%.2f", round(long$pval[keep], 3))
  )
}

# χ²(4) = 400.46, P <.001
# χ²(4) = 10680.52, P <.001
# 
#                      Var1 Var2 Residual signif
# 1     GP basic ambulatory    0    -3.55   0.00
# 2 GP intensive ambulatory    0    -4.96   0.00
# 3          GP residential    0    16.17   0.00
# 4 WO intensive ambulatory    0    -3.64   0.00
# 5          WO residential    0     4.17   0.00
# 6 GP intensive ambulatory    1     2.30   0.02
# 7          GP residential    1    -7.51   0.00
#                       Var1   Var2 Residual signif
# 1      GP basic ambulatory hombre     9.33   0.00
# 2  GP intensive ambulatory hombre    11.85   0.00
# 3           GP residential hombre    17.95   0.00
# 4  WO intensive ambulatory hombre   -40.54   0.00
# 5           WO residential hombre   -33.60   0.00
# 6      GP basic ambulatory  mujer   -13.89   0.00
# 7  GP intensive ambulatory  mujer   -17.63   0.00
# 8           GP residential  mujer   -26.71   0.00
# 9  WO intensive ambulatory  mujer    60.34   0.00
# 10          WO residential  mujer    50.01   0.00

# Run each χ² test once and reuse it for both the residuals and the caption
# (the original called chisq.test() twice per table).
chi1 <- chisq.test(tab1)
format_residuals(chi1$residuals) |>
  knitr::kable("markdown",
               caption = paste0("non-completion × setting (", format_chisq(chi1), ")"))

chi2 <- chisq.test(tab2)
# NOTE(review): tab2 crosses setting with sex_rec, so the "PSU" wording in
# this caption looks mislabeled — confirm before publication.
format_residuals(chi2$residuals) |>
  knitr::kable("markdown",
               caption = paste0("PSU x setting (", format_chisq(chi2), ")"))

# Woolf test for homogeneity of odds ratios across the strata of a
# 2 x 2 x k contingency table.
#
# @param tab3d 3-dimensional table/array: rows = exposure (2 levels),
#   columns = outcome (2 levels), third margin = stratum.
# @return list with the chi-square statistic, degrees of freedom (k - 1),
#   p-value, and the per-stratum logOR, OR and variance vectors.
# NOTE: strata with zero cells give infinite log-ORs/variances; no
#   continuity correction is applied (same as the original loop version).
woolf_manual <- function(tab3d) {
  # Cell counts per stratum. as.vector() drops dimnames so the returned
  # vectors stay unnamed, matching the original output. (The original loop
  # also shadowed base::c with a local variable named `c`.)
  n11 <- as.vector(tab3d[1, 1, ]); n12 <- as.vector(tab3d[1, 2, ])
  n21 <- as.vector(tab3d[2, 1, ]); n22 <- as.vector(tab3d[2, 2, ])
  
  # Per-stratum log odds ratio and its (Woolf) variance, vectorized
  logOR     <- log((n11 * n22) / (n12 * n21))
  var_logOR <- 1 / n11 + 1 / n12 + 1 / n21 + 1 / n22
  
  # Inverse-variance weighted mean log-OR
  w <- 1 / var_logOR
  theta_bar <- sum(w * logOR) / sum(w)
  
  # Heterogeneity statistic ~ chi-square with k - 1 degrees of freedom
  X2  <- sum(w * (logOR - theta_bar)^2)
  df  <- dim(tab3d)[3] - 1
  p   <- 1 - pchisq(X2, df)
  
  list(statistic = X2, df = df, p.value = p,
       logOR = logOR, OR = exp(logOR), var = var_logOR)
}

# Example with your data: Woolf homogeneity test across plan types,
# using each patient's first treatment row.
tab <- xtabs(
  ~ polysubstance_strict + tr_noncompletion + plan_type_mod,
  data = tidytable::slice_head(dat_with_weights_stab_cons, n = 1, .by = hash_key)
)
woolf_manual(tab)

# $statistic
# [1] 55.8
# 
# $df
# [1] 4
# 
# $p.value
# [1] 2.18e-11

# $statistic
# [1] 24.37842
# 
# $df
# [1] 4
# 
# $p.value
# [1] 0.0000670675

# Per-setting χ² tests of PSU vs non-completion, formatted for display:
# one row per plan type with a "χ²(df) = stat, P ..." string.
assoc_by_stratum <- dat_with_weights_stab_cons %>%
    split(.$plan_type_mod) %>%
    map(~ {
        tab <- table(.x$polysubstance_strict, .x$tr_noncompletion)
        # NOTE(review): suppressWarnings() hides the "expected counts < 5"
        # warning; confirm that approximation is acceptable in every stratum.
        test <- suppressWarnings(chisq.test(tab))  # χ² test
        
        stat <- unname(test$statistic)
        df   <- unname(test$parameter)
        pval <- test$p.value
        
        # Scalar condition: plain if/else instead of ifelse()
        p_str <- if (pval < .001) "<.001" else sprintf("=%.2f", pval)
        
        tibble::tibble(
            plan_type_mod = unique(.x$plan_type_mod),
            chisq_string  = sprintf("χ²(%d) = %.2f, P %s", df, stat, p_str)
        )
    }) %>%
    bind_rows()

#   plan_type_mod           chisq_string           
#   <chr>                   <chr>                  
# 1 GP basic ambulatory     χ²(1) = 335.27, P <.001
# 2 GP intensive ambulatory χ²(1) = 241.82, P <.001
# 3 GP residential          χ²(1) = 22.78, P <.001 
# 4 WO intensive ambulatory χ²(1) = 32.19, P <.001 
# 5 WO residential          χ²(1) = 19.40, P <.001 

# 1 GP basic ambulatory     χ²(1) = 314.19, P <.001
# 2 GP intensive ambulatory χ²(1) = 216.23, P <.001
# 3 GP residential          χ²(1) = 16.22, P <.001 
# 4 WO intensive ambulatory χ²(1) = 27.09, P <.001 
# 5 WO residential          χ²(1) = 14.88, P <.001 

# Render the per-setting association table
assoc_by_stratum |>
  knitr::kable("markdown", caption = "Association between noncompletion and PSU")
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  2.000   2.000   2.000   2.796   3.000  13.000 
 polysubstance_strict     n   percent
                    0  5956 0.2170158
                    1 21489 0.7829842
non-completion × setting (χ²(4) = 290.8, P <.001)
Var1 Var2 Residual signif
GP basic ambulatory 0 -6.00 0.00
GP intensive ambulatory 0 -3.44 0.00
GP residential 0 12.70 0.00
WO residential 0 3.72 0.00
GP basic ambulatory 1 3.24 0.00
GP residential 1 -6.87 0.00
WO residential 1 -2.01 0.04
PSU x setting (χ²(4) = 10622.89, P <.001)
Var1 Var2 Residual signif
GP basic ambulatory hombre 9.05 0.00
GP intensive ambulatory hombre 10.91 0.00
GP residential hombre 19.12 0.00
WO intensive ambulatory hombre -35.58 0.00
WO residential hombre -37.45 0.00
GP basic ambulatory mujer -13.68 0.00
GP intensive ambulatory mujer -16.49 0.00
GP residential mujer -28.89 0.00
WO intensive ambulatory mujer 53.76 0.00
WO residential mujer 56.60 0.00
$statistic
[1] 14.89707

$df
[1] 4

$p.value
[1] 0.004919527

$logOR
[1] 0.5419830 0.3926138 0.1484303 0.2906816 0.2998210

$OR
[1] 1.719413 1.480846 1.160012 1.337339 1.349617

$var
[1] 0.003268931 0.003143068 0.008553028 0.017393284 0.016000554
Association between noncompletion and PSU
plan_type_mod chisq_string
GP basic ambulatory χ²(1) = 242.25, P <.001
GP intensive ambulatory χ²(1) = 163.65, P <.001
GP residential χ²(1) = 13.58, P <.001
WO intensive ambulatory χ²(1) = 22.12, P <.001
WO residential χ²(1) = 58.00, P <.001

The association between PSU and treatment non-completion varied significantly across treatment settings [Woolf test: χ²(4)=14.90, P=.005]. At the stratum level, PSU was consistently associated with increased odds of non-completion, but the strength differed: strongest in general-population basic ambulatory (OR≈1.72, χ²(1)=242.25, P<.001) and intensive ambulatory (OR≈1.48, χ²(1)=163.65, P<.001), and more modest in the remaining settings (GP residential: OR≈1.16, χ²(1)=13.58, P<.001; WO intensive ambulatory: OR≈1.34, χ²(1)=22.12, P<.001; WO residential: OR≈1.35, χ²(1)=58.00, P<.001).

Code
# Share of patients who EVER report PSU, stratified by total number of
# treatment episodes.
# NOTE(review): rows here are treatment episodes, so `n` below counts
# episodes (patients x max_treatment), not unique patients, even though the
# caption says "patients" — confirm which denominator is intended.
dat_with_weights_stab_cons %>%
    tidytable::group_by(id) %>%
    tidytable::mutate(max_treatment = tidytable::n()) %>%
    tidytable::mutate(
        # Ever-PSU flag; constant across all rows of the same patient
        psu_any = as.integer(any(polysubstance_strict == 1, na.rm = TRUE))
    ) %>%
    tidytable::group_by(max_treatment) %>%
    # NOTE(review): group_by(max_treatment) above is redundant with the
    # `.by = max_treatment` argument here — one of the two can be dropped.
    tidytable::summarise(
        n = tidytable::n(),
        psu_rate = mean(psu_any),
        .by = max_treatment
    ) %>%
    knitr::kable("markdown", caption= "Proportion of patients with PSU, by no. of treatments")

# Patient-level PSU summary among patients with >= 2 treatment episodes:
# PSU at the first episode, PSU at any episode, and PSU appearing only
# after the first episode.
dat_with_weights_stab_cons %>%
    tidytable::arrange(id, treatment) %>%
    tidytable::group_by(id) %>%
    tidytable::filter(tidytable::n() >= 2) %>%
    tidytable::summarise(
        psu_adm = first(polysubstance_strict),   # PSU flag at first episode
        psu_any = as.integer(any(polysubstance_strict == 1, na.rm = TRUE))
        # Fix: a psu_later column was computed here but never used;
        # "later only" is derived below as psu_adm == 0 & psu_any == 1,
        # which is equivalent for patients without PSU at admission.
    ) %>%
    tidytable::summarise(
        total = tidytable::n(),
        adm_1 = sum(psu_adm == 1),
        any_1 = sum(psu_any == 1),
        later_only = sum(psu_adm == 0 & psu_any == 1),
        .by = NULL
    ) |> 
    knitr::kable(
    format   = "markdown",
    caption  = "Summary of PSU across repeated treatments",
    col.names= c("Total patients",
                 "PSU at admission",
                 "PSU at any treatment episode",
                 "PSU, later-only")
  )
# | Total patients| PSU at admission| PSU at any treatment episode| PSU, later-only|
# |--------------:|----------------:|----------------------------:|---------------:|
# |          27445|            21489|                        24231|            2742|
# | Total patients| PSU at admission| PSU at any treatment episode| PSU, later-only|
# |--------------:|----------------:|----------------------------:|---------------:|
# |          27445|            21489|                        24231|            2742|
Proportion of patients with PSU, by no. of treatments
max_treatment n psu_rate
2 38040 0.8590957
3 16554 0.9209859
4 7344 0.9542484
5 3275 0.9816794
6 1596 0.9924812
7 602 1.0000000
8 320 1.0000000
9 153 1.0000000
10 60 1.0000000
13 13 1.0000000
Summary of PSU across repeated treatments
Total patients PSU at admission PSU at any treatment episode PSU, later-only
27445 21489 24231 2742

People with 2+ treatments have higher PSU rates (85.9%–100%) than those with only 1 treatment (64.2%).

Table 2

Code
# 0) Optional time window restriction if needed (keeps records within 2010-2019)
#     Comment out if dat is already restricted.

# 1) Person-level cohort: multiple SUD treatments.
#    Collapses the episode-level data to one row per patient, carrying the
#    PSU flags, non-completion events and follow-up time used by the
#    Table 2 panels below.
px <- dat_with_weights_stab_cons %>%
  tidytable::ungroup() %>%
  tidytable::arrange(id, treatment) %>%   # chronological order so first() = first episode
  tidytable::group_by(id) %>%
  tidytable::summarise(
    # PSU flags
    psu_adm = first(polysubstance_strict),       # PSU at first treatment
    psu_any = as.integer(any(polysubstance_strict == 1, na.rm = TRUE)),  # PSU in any treatment

    # Non-completion events
    nc_first = first(tr_noncompletion),          # non-completion at first treatment (0/1)
    nc_any   = as.integer(any(tr_noncompletion == 1, na.rm = TRUE)),     # ≥1 non-completion

    # Follow-up time in months (person-months)
    # NOTE(review): suppressWarnings() here hides whatever warning first()
    # might raise — unclear from this chunk which warning was expected;
    # confirm it is still needed.
    pt = suppressWarnings(first(cens_time))      # person-level follow-up time
  ) %>%
  tidytable::ungroup() %>%
  tidytable::mutate(
    # Human-readable PSU labels used as panel groups downstream
    psu_adm_lbl = tidytable::if_else(psu_adm == 1, "Reported", "Not reported"),
    psu_any_lbl = tidytable::if_else(psu_any == 1, "Reported", "Not reported")
  )

# Helper to compute IR and normal-approx 95% CI per 1,000 person-months
# Per-group counts, prevalence, person-time and incidence rate
# (per 1,000 person-months) with a normal-approximation 95% CI.
#
# @param df Person-level data with a `pt` (person-time) column.
# @param group_var Unquoted grouping column (tidy-eval embraced).
# @param event_col Unquoted 0/1 event column (tidy-eval embraced).
# @return tidytable with display-ready columns.
summarise_block <- function(df, group_var, event_col) {
  z <- 1.96  # normal quantile for a 95% confidence interval

  tallies <- tidytable::summarise(
    df,
    n = tidytable::n(),
    prevalence = mean({{ event_col }} == 1, na.rm = TRUE),
    follow_up_time = sum(pt, na.rm = TRUE),
    events = sum({{ event_col }}, na.rm = TRUE),
    .by = {{ group_var }}
  )

  tallies |>
    tidytable::mutate(
      # Incidence rate per 1,000 person-months with Poisson-count SE;
      # pmax() guards the zero-event case.
      IR  = 1000 * events / follow_up_time,
      se  = 1000 * sqrt(pmax(events, 1e-9)) / follow_up_time,
      lcl = IR - z * se,
      ucl = IR + z * se,
      `IR (95% CI)`   = sprintf("%.2f (%.2f, %.2f)", IR, lcl, ucl),
      Prevalence      = scales::percent(prevalence, accuracy = 1),
      `Follow-up time`= scales::number(follow_up_time, accuracy = 0.01, big.mark = ","),
      `Polysubstance use` = paste0({{ group_var }}, " (n= ", scales::comma(n), ")")
    ) |>
    tidytable::select(
      `Polysubstance use`, Prevalence, `Follow-up time`, Events = events, `IR (95% CI)`
    )
}

# 2) Four table panels
# A) PSU at admission AND at least one event of non-completion
tab_A <- summarise_block(px, psu_adm_lbl, nc_any) %>%
  mutate(Type = "PSU at admission and at least one event of non-completion")

# B) PSU at admission AND non-completion at first treatment
tab_B <- summarise_block(px, psu_adm_lbl, nc_first) %>%
  mutate(Type = "PSU at admission and non-completion at first treatment")

# C) ≥1 treatment reporting PSU AND ≥1 event of non-completion
tab_C <- summarise_block(px, psu_any_lbl, nc_any) %>%
  mutate(Type = "At least one treatment reporting PSU and at least one event of non-completion")

# D) ≥1 treatment reporting PSU AND non-completion at first treatment
tab_D <- summarise_block(px, psu_any_lbl, nc_first) %>%
  mutate(Type = "At least one treatment reporting PSU and non-completion at the first treatment")

# 3) Bind and order like Table 2.
#    Fix: put the panel label (Type) FIRST so the columns line up with the
#    col.names below — previously Type sat last while the header labeled the
#    first column "type", shifting every header by one position.
table_2 <- bind_rows(tab_A, tab_B, tab_C, tab_D) %>%
  tidytable::mutate(
    Type = factor(
      Type,
      c(
        "PSU at admission and at least one event of non-completion",
        "PSU at admission and non-completion at first treatment",
        "At least one treatment reporting PSU and at least one event of non-completion",
        "At least one treatment reporting PSU and non-completion at the first treatment"
      )
    ),
    # Strip the " (n= ...)" suffix so rows sort Not reported / Reported
    grp_ord = factor(gsub(" \\(n=.*", "", `Polysubstance use`),
                     levels = c("Not reported", "Reported"))
  ) %>%
  tidytable::arrange(Type, grp_ord) %>%
  tidytable::select(Type, `Polysubstance use`, Prevalence, `Follow-up time`,
                    Events, `IR (95% CI)`)

# 4) Print — fix: six alignment codes for six columns (was "lrrrr", one short)
knitr::kable(
  table_2,
  align = "llrrrr",
  col.names = c("type","Polysubstance use", "Prevalence", "Follow-up time", "Events", "IR (95% CI)")
)

# type  Polysubstance use   Prevalence  Follow-up time  Events  IR (95% CI)
# Not reported (n= 6,909)   91% 734,496.81  6314    8.60 (8.38, 8.81)   PSU at admission and at least one event of non-completion
# Reported (n= 24,360)  95% 2,948,798.17    23163   7.86 (7.75, 7.96)   PSU at admission and at least one event of non-completion
# Not reported (n= 6,909)   79% 734,496.81  5429    7.39 (7.19, 7.59)   PSU at admission and non-completion at first treatment
# Reported (n= 24,360)  83% 2,948,798.17    20293   6.88 (6.79, 6.98)   PSU at admission and non-completion at first treatment
# Not reported (n= 3,600)   87% 364,607.66  3145    8.63 (8.32, 8.93)   At least one treatment reporting PSU and at least one event of non-completion
# Reported (n= 27,669)  95% 3,318,687.31    26332   7.93 (7.84, 8.03)   At least one treatment reporting PSU and at least one event of non-completion
# Not reported (n= 3,600)   74% 364,607.66  2650    7.27 (6.99, 7.54)   At least one treatment reporting PSU and non-completion at the first treatment
# Reported (n= 27,669)  83% 3,318,687.31    23072   6.95 (6.86, 7.04)   At least one treatment reporting PSU and non-completion at the first treatment
type Polysubstance use Prevalence Follow-up time Events IR (95% CI)
Not reported (n= 5,956) 88% 654,592.44 5268 8.05 (7.83, 8.27) PSU at admission and at least one event of non-completion
Reported (n= 21,489) 93% 2,686,018.20 20001 7.45 (7.34, 7.55) PSU at admission and at least one event of non-completion
Not reported (n= 5,956) 73% 654,592.44 4323 6.60 (6.41, 6.80) PSU at admission and non-completion at first treatment
Reported (n= 21,489) 79% 2,686,018.20 16909 6.30 (6.20, 6.39) PSU at admission and non-completion at first treatment
Not reported (n= 3,214) 84% 336,381.49 2702 8.03 (7.73, 8.34) At least one treatment reporting PSU and at least one event of non-completion
Reported (n= 24,231) 93% 3,004,229.15 22567 7.51 (7.41, 7.61) At least one treatment reporting PSU and at least one event of non-completion
Not reported (n= 3,214) 67% 336,381.49 2165 6.44 (6.17, 6.71) At least one treatment reporting PSU and non-completion at the first treatment
Reported (n= 24,231) 79% 3,004,229.15 19067 6.35 (6.26, 6.44) At least one treatment reporting PSU and non-completion at the first treatment

Stratify by plan type

Code
cat("percentage of women by plan type (women, vs. gp)\n")
# Column proportions of sex within women-only (WO) vs. general-population (GP)
# plan types, restricted to the first treatment episode (treatment == 1)
subset(dat_with_weights_stab_cons, treatment==1, c("sex_rec","plan_type_mod")) |> mutate(wo=grepl("WO",plan_type_mod)) |> janitor::tabyl(sex_rec, wo) |> janitor::adorn_percentages("col")
 # sex_rec     FALSE TRUE
 #  hombre 0.8084971    0
 #   mujer 0.1915029    1

tab3<- 
split(dat_with_weights_stab_cons, dat_with_weights_stab_cons$plan_type_mod)|>
  purrr::imap(~{
    # Unweighted Poisson GEE of treatment non-completion on PSU within one
    # plan-type stratum (.x = stratum rows, .y = stratum label).
    # Returns one tidy row per retained term with exponentiated estimates.
    df <- .x|> dplyr::mutate(id = as.character(id))

    # Clean formula: NO cluster()/strata()/id inside
    ff_local <- tr_noncompletion ~ polysubstance_strict +
      age_c10 + year_c +
      susinidum_oh + susinidum_coc + susinidum_pbc + susinidum_mar +
      dg_psiq_cie_10_instudy + dg_psiq_cie_10_dg +
      prim_sub_daily + occ24_inactive + occ24_unemployed +
      susprindum_coc + susprindum_pbc + susprindum_mar

    # Freeze row universe once (complete cases on every model variable)
    mf0  <- stats::model.frame(ff_local, data = df, na.action = stats::na.pass)
    keep <- stats::complete.cases(mf0)
    mf   <- mf0[keep, , drop = FALSE]
    if (nrow(mf) == 0L) return(NULL)

    # Outcome variation check: a constant response cannot be modeled
    y <- stats::model.response(mf)
    if (length(unique(y)) < 2L) {
      return(tibble::tibble(
        plan_type_mod = as.character(.y),
        n_rows = nrow(mf),
        term = "(skipped: no outcome variation)",
        estimate = NA_real_, conf.low = NA_real_, conf.high = NA_real_
      ))
    }

    # Design matrix and term mapping
    X <- stats::model.matrix(ff_local, mf)
    assign_vec <- attr(X, "assign")                # maps columns -> term index
    term_labels <- attr(stats::terms(ff_local), "term.labels")

    # 1) Drop constant columns (except intercept)
    const <- vapply(as.data.frame(X), function(v) length(unique(v)) == 1L, logical(1))
    const[colnames(X) == "(Intercept)"] <- FALSE
    if (any(const)) {
      X <- X[, !const, drop = FALSE]
      assign_vec <- assign_vec[!const]
    }

    # 2) Detect collinear columns via QR. Only the pivot/rank are needed:
    #    the reduced design matrix itself is never used, because the model is
    #    refit below from a reduced *formula* (previous version built an
    #    unused X_reduced here; removed).
    qrX <- qr(X)
    keep_cols <- sort(qrX$pivot[seq_len(qrX$rank)])
    assign_kept <- assign_vec[keep_cols]

    # 3) Map kept columns back to ORIGINAL term labels
    kept_term_idx <- sort(unique(assign_kept[assign_kept > 0]))  # >0 skips intercept
    if (length(kept_term_idx) == 0L) {
      return(tibble::tibble(
        plan_type_mod = as.character(.y),
        n_rows = nrow(mf),
        term = "(no predictors after cleanup)",
        estimate = NA_real_, conf.low = NA_real_, conf.high = NA_real_
      ))
    }
    kept_terms <- term_labels[kept_term_idx]

    # 4) Rebuild reduced formula from ORIGINAL terms
    ff_reduced <- stats::reformulate(kept_terms, response = "tr_noncompletion")

    # 5) Align id to kept rows (same `keep` mask)
    id_vec <- df$id[keep]

    # 6) Fit GEE (independence working correlation, scale fixed at 1);
    #    robust (sandwich) standard errors come from geeglm itself
    fit <- geepack::geeglm(
      ff_reduced,
      id        = id_vec,
      data      = mf,
      family    = poisson,
      corstr    = "independence",
      scale.fix = TRUE,
      na.action = stats::na.exclude
      #, weights = df$w0_tr[keep]   # or w1_tr, if you want weights by stratum
    )

    # Exponentiated coefficients = rate ratios with 95% CIs
    broom::tidy(fit, exponentiate = TRUE, conf.int = TRUE)|>
      dplyr::transmute(
        plan_type_mod = as.character(.y),
        n_rows        = nrow(mf),
        term,
        `RR (95% CI)` = sprintf("%.4f (%.4f, %.4f)", estimate, conf.low, conf.high)
      )
  })|>
  dplyr::bind_rows()|>
  dplyr::relocate(plan_type_mod, n_rows)

tab3_w0<- 
# BUG FIX: split factor previously came from a different object
# (`dat$plan_type_mod`); it must come from the same data being split,
# matching the unweighted tab3 pipeline.
split(dat_with_weights_stab_cons, dat_with_weights_stab_cons$plan_type_mod)|>
  purrr::imap(~{
    # Poisson GEE of treatment non-completion on PSU within one plan-type
    # stratum, weighted by the lag-0 stabilized weights (w0_tr).
    df <- .x|> dplyr::mutate(id = as.character(id))

    # Clean formula: NO cluster()/strata()/id inside
    ff_local <- tr_noncompletion ~ polysubstance_strict +
      age_c10 + year_c +
      susinidum_oh + susinidum_coc + susinidum_pbc + susinidum_mar +
      dg_psiq_cie_10_instudy + dg_psiq_cie_10_dg +
      prim_sub_daily + occ24_inactive + occ24_unemployed +
      susprindum_coc + susprindum_pbc + susprindum_mar

    # Freeze row universe once (complete cases on every model variable)
    mf0  <- stats::model.frame(ff_local, data = df, na.action = stats::na.pass)
    keep <- stats::complete.cases(mf0)
    mf   <- mf0[keep, , drop = FALSE]
    if (nrow(mf) == 0L) return(NULL)

    # Outcome variation check: a constant response cannot be modeled
    y <- stats::model.response(mf)
    if (length(unique(y)) < 2L) {
      return(tibble::tibble(
        plan_type_mod = as.character(.y),
        n_rows = nrow(mf),
        term = "(skipped: no outcome variation)",
        estimate = NA_real_, conf.low = NA_real_, conf.high = NA_real_
      ))
    }

    # Design matrix and term mapping
    X <- stats::model.matrix(ff_local, mf)
    assign_vec <- attr(X, "assign")                # maps columns -> term index
    term_labels <- attr(stats::terms(ff_local), "term.labels")

    # 1) Drop constant columns (except intercept)
    const <- vapply(as.data.frame(X), function(v) length(unique(v)) == 1L, logical(1))
    const[colnames(X) == "(Intercept)"] <- FALSE
    if (any(const)) {
      X <- X[, !const, drop = FALSE]
      assign_vec <- assign_vec[!const]
    }

    # 2) Detect collinear columns via QR; only pivot/rank are needed
    #    (the reduced matrix itself was unused dead code; removed)
    qrX <- qr(X)
    keep_cols <- sort(qrX$pivot[seq_len(qrX$rank)])
    assign_kept <- assign_vec[keep_cols]

    # 3) Map kept columns back to ORIGINAL term labels
    kept_term_idx <- sort(unique(assign_kept[assign_kept > 0]))  # >0 skips intercept
    if (length(kept_term_idx) == 0L) {
      return(tibble::tibble(
        plan_type_mod = as.character(.y),
        n_rows = nrow(mf),
        term = "(no predictors after cleanup)",
        estimate = NA_real_, conf.low = NA_real_, conf.high = NA_real_
      ))
    }
    kept_terms <- term_labels[kept_term_idx]

    # 4) Rebuild reduced formula from ORIGINAL terms
    ff_reduced <- stats::reformulate(kept_terms, response = "tr_noncompletion")

    # 5) Align id to kept rows (same `keep` mask)
    id_vec <- df$id[keep]

    # 6) Fit GEE (independence working correlation, scale fixed at 1),
    #    weighted with the lag-0 weights aligned to the kept rows
    fit <- geepack::geeglm(
      ff_reduced,
      id        = id_vec,
      data      = mf,
      family    = poisson,
      corstr    = "independence",
      scale.fix = TRUE,
      na.action = stats::na.exclude
      , weights = df$w0_tr[keep]   # or w1_tr, if you want weights by stratum
    )

    # Exponentiated coefficients = rate ratios with 95% CIs
    broom::tidy(fit, exponentiate = TRUE, conf.int = TRUE)|>
      dplyr::transmute(
        plan_type_mod = as.character(.y),
        n_rows        = nrow(mf),
        term,
        `RR (95% CI)` = sprintf("%.4f (%.4f, %.4f)", estimate, conf.low, conf.high)
      )
  })|>
  dplyr::bind_rows()|>
  dplyr::relocate(plan_type_mod, n_rows)



tab3_w1<- 
# BUG FIX: split factor previously came from a different object
# (`dat$plan_type_mod`); it must come from the same data being split,
# matching the unweighted tab3 pipeline.
split(dat_with_weights_stab_cons, dat_with_weights_stab_cons$plan_type_mod)|>
  purrr::imap(~{
    # Poisson GEE of treatment non-completion on PSU within one plan-type
    # stratum, weighted by the lag-1 stabilized weights (w1_tr).
    df <- .x|> dplyr::mutate(id = as.character(id))

    # Clean formula: NO cluster()/strata()/id inside
    ff_local <- tr_noncompletion ~ polysubstance_strict +
      age_c10 + year_c +
      susinidum_oh + susinidum_coc + susinidum_pbc + susinidum_mar +
      dg_psiq_cie_10_instudy + dg_psiq_cie_10_dg +
      prim_sub_daily + occ24_inactive + occ24_unemployed +
      susprindum_coc + susprindum_pbc + susprindum_mar

    # Freeze row universe once (complete cases on every model variable)
    mf0  <- stats::model.frame(ff_local, data = df, na.action = stats::na.pass)
    keep <- stats::complete.cases(mf0)
    mf   <- mf0[keep, , drop = FALSE]
    if (nrow(mf) == 0L) return(NULL)

    # Outcome variation check: a constant response cannot be modeled
    y <- stats::model.response(mf)
    if (length(unique(y)) < 2L) {
      return(tibble::tibble(
        plan_type_mod = as.character(.y),
        n_rows = nrow(mf),
        term = "(skipped: no outcome variation)",
        estimate = NA_real_, conf.low = NA_real_, conf.high = NA_real_
      ))
    }

    # Design matrix and term mapping
    X <- stats::model.matrix(ff_local, mf)
    assign_vec <- attr(X, "assign")                # maps columns -> term index
    term_labels <- attr(stats::terms(ff_local), "term.labels")

    # 1) Drop constant columns (except intercept)
    const <- vapply(as.data.frame(X), function(v) length(unique(v)) == 1L, logical(1))
    const[colnames(X) == "(Intercept)"] <- FALSE
    if (any(const)) {
      X <- X[, !const, drop = FALSE]
      assign_vec <- assign_vec[!const]
    }

    # 2) Detect collinear columns via QR; only pivot/rank are needed
    #    (the reduced matrix itself was unused dead code; removed)
    qrX <- qr(X)
    keep_cols <- sort(qrX$pivot[seq_len(qrX$rank)])
    assign_kept <- assign_vec[keep_cols]

    # 3) Map kept columns back to ORIGINAL term labels
    kept_term_idx <- sort(unique(assign_kept[assign_kept > 0]))  # >0 skips intercept
    if (length(kept_term_idx) == 0L) {
      return(tibble::tibble(
        plan_type_mod = as.character(.y),
        n_rows = nrow(mf),
        term = "(no predictors after cleanup)",
        estimate = NA_real_, conf.low = NA_real_, conf.high = NA_real_
      ))
    }
    kept_terms <- term_labels[kept_term_idx]

    # 4) Rebuild reduced formula from ORIGINAL terms
    ff_reduced <- stats::reformulate(kept_terms, response = "tr_noncompletion")

    # 5) Align id to kept rows (same `keep` mask)
    id_vec <- df$id[keep]

    # 6) Fit GEE (independence working correlation, scale fixed at 1),
    #    weighted with the lag-1 weights aligned to the kept rows
    fit <- geepack::geeglm(
      ff_reduced,
      id        = id_vec,
      data      = mf,
      family    = poisson,
      corstr    = "independence",
      scale.fix = TRUE,
      na.action = stats::na.exclude
      , weights = df$w1_tr[keep]   # weights by stratum, lag = 1
    )

    # Exponentiated coefficients = rate ratios with 95% CIs
    broom::tidy(fit, exponentiate = TRUE, conf.int = TRUE)|>
      dplyr::transmute(
        plan_type_mod = as.character(.y),
        n_rows        = nrow(mf),
        term,
        `RR (95% CI)` = sprintf("%.4f (%.4f, %.4f)", estimate, conf.low, conf.high)
      )
  })|>
  dplyr::bind_rows()|>
  dplyr::relocate(plan_type_mod, n_rows)

format_ci <- function(x) {
  # Reformat "est (lo, hi)" strings to two decimal places.
  # Base-R replacement for the previous stringr::str_match call: one fewer
  # dependency, identical output ("NA (NA, NA)" for non-matching input).
  pattern <- "([0-9.]+) \\(([^,]+), ([^)]+)\\)"
  groups  <- regmatches(x, regexec(pattern, x))
  # Extract capture group i as numeric; NA when the string did not match
  pick <- function(i) {
    vapply(groups, function(g) {
      if (length(g) >= i) as.numeric(g[i]) else NA_real_
    }, numeric(1))
  }
  sprintf("%.2f (%.2f, %.2f)", pick(2), pick(3), pick(4))
}

 # Collect the PSU coefficient rows from the unweighted and both weighted
 # fits, one labelled block per model
psu_rows <- rbind.data.frame(
  cbind.data.frame(w = "no weight",    filter(tab3,    grepl("poly", term))),
  cbind.data.frame(w = "weight lag=0", filter(tab3_w0, grepl("poly", term))),
  cbind.data.frame(w = "weight lag=1", filter(tab3_w1, grepl("poly", term)))
)

# One row per plan type, one RR column per weighting scheme
wide_tab <- psu_rows |>
  pivot_wider(
    names_from = w,
    values_from = `RR (95% CI)`
  ) |>
  relocate(plan_type_mod, n_rows, term) |>
  select(-term)

# Side effects in the same order as before: report total N, persist to CSV,
# expose the table globally for the meta-analysis section
cat(paste0("Total cases included in the model: ",
           formatC(summarise(wide_tab, sum_n = sum(n_rows)) |> pull(sum_n), big.mark = ","), "\n"))
utils::write.csv(wide_tab, file.path(wdpath, "data/20241015_out/psu", "stratified_results.csv"), row.names = FALSE)
stratified_results <- wide_tab

# Render with CIs rounded to two decimals
wide_tab |>
  mutate(across(
    c(`no weight`, `weight lag=0`, `weight lag=1`),
    format_ci
  )) |>
  knitr::kable("markdown", digits = 2)
 
# |plan_type_mod           | n_rows|no weight         |weight lag=0      |weight lag=1      |
# |:-----------------------|------:|:-----------------|:-----------------|:-----------------|
# |GP basic ambulatory     |  20427|1.30 (1.20, 1.40) |1.30 (1.19, 1.41) |1.27 (1.16, 1.38) |
# |GP intensive ambulatory |  33782|1.21 (1.13, 1.29) |1.21 (1.13, 1.31) |1.17 (1.09, 1.26) |
# |GP residential          |  13991|0.96 (0.86, 1.08) |0.98 (0.86, 1.10) |0.95 (0.84, 1.07) |
# |WO intensive ambulatory |   6476|1.24 (1.07, 1.43) |1.21 (1.04, 1.42) |1.26 (1.08, 1.47) |
# |WO residential          |   6733|1.14 (0.98, 1.32) |1.10 (0.93, 1.29) |1.14 (0.97, 1.35) |
#67,957
percentage of women by plan type (women, vs. gp)
 sex_rec     FALSE TRUE
  hombre 0.8084971    0
   mujer 0.1915029    1
Total cases included in the model: 67,957
plan_type_mod n_rows no weight weight lag=0 weight lag=1
GP basic ambulatory 20753 1.05 (1.03, 1.07) 1.06 (1.04, 1.08) 1.06 (1.03, 1.08)
GP intensive ambulatory 26825 1.04 (1.02, 1.06) 1.04 (1.02, 1.06) 1.04 (1.02, 1.06)
GP residential 10410 1.02 (0.99, 1.06) 1.03 (0.99, 1.07) 1.01 (0.97, 1.05)
WO intensive ambulatory 4557 1.04 (1.00, 1.09) 1.05 (1.00, 1.10) 1.05 (1.00, 1.09)
WO residential 5412 1.12 (1.07, 1.17) 1.13 (1.07, 1.18) 1.13 (1.07, 1.18)
Code
library(metafor)

Cargando paquete requerido: Matrix

Adjuntando el paquete: ‘Matrix’

The following object is masked from ‘package:tidytable’:

expand

Cargando paquete requerido: metadat

Cargando paquete requerido: numDeriv

Loading the ‘metafor’ package (version 4.8-0). For an introduction to the package please type: help(metafor)

Code
# Helper: split "est (lo, hi)" strings into numeric estimate and CI bounds
parse_ci <- function(x) {
  pattern <- "([0-9.]+) \\(([^,]+), ([^)]+)\\)"
  captured <- stringr::str_match(x, pattern)
  tibble::tibble(
    est   = as.numeric(captured[, 2]),
    ci_lo = as.numeric(captured[, 3]),
    ci_hi = as.numeric(captured[, 4])
  )
}

#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
##_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
# Reshape into long format & parse: one row per plan type x weighting model;
# parse_ci() turns each "est (lo, hi)" string into numeric columns, from
# which the meta-analysis inputs (log-RR and its SE) are derived.
df_long <- as.data.frame(stratified_results) %>%
  pivot_longer(
    cols = c(`no weight`, `weight lag=0`, `weight lag=1`),
    names_to = "model", values_to = "res"
  ) %>%
  mutate(parsed = lapply(res, parse_ci)) %>%
  tidyr::unnest_wider(parsed) %>%   # <-- force tidyr version
  mutate(
    se = (ci_hi - ci_lo)/(2*1.96),  # normal-approximation SE from the 95% CI width
    yi = log(est)                   # log relative risk (effect size for rma.uni)
  )


# Cochran's Q heterogeneity test per weighting MODEL: for each of the three
# weighting schemes, pool the plan-type strata with a fixed-effect
# meta-analysis and test whether the stratum effects are homogeneous.
# (NOTE: grouping is by `model`, across plan types — not per plan_type_mod.)
cochran_by_weights <- df_long %>%
    dplyr::group_by(model) %>%
    dplyr::group_map(~{
        # Fixed-effect pooling of the stratum log-RRs (yi) with SEs (se)
        r <- rma.uni(yi = .x$yi, sei = .x$se, method = "FE")
        tibble::tibble(
            model      = .y$model,
            Q          = r$QE,      # Cochran's Q statistic
            df         = r$k - 1,   # degrees of freedom = strata - 1
            pval       = r$QEp,
            pooled_est = exp(r$b),  # back-transform pooled log-RR to RR scale
            pooled_ci  = paste0(
                round(exp(r$ci.lb), 2), " – ", round(exp(r$ci.ub), 2)
            )
        )
    }) %>%
    dplyr::bind_rows()

cochran_by_weights |> 
  knitr::kable("markdown", caption= "Heterogeneity by plan type")
Heterogeneity by plan type
model Q df pval pooled_est pooled_ci
no weight 9.471652 4 0.0503330 1.044810 1.03 – 1.06
weight lag=0 7.843235 4 0.0974924 1.053093 1.04 – 1.07
weight lag=1 12.103026 4 0.0166014 1.047039 1.03 – 1.06


Session info

Code
#|echo: true
#|error: true
#|message: true
#|paged.print: true
# Report the active user library path for reproducibility
message(paste0("R library: ", Sys.getenv("R_LIBS_USER")))

R library: G:/My Drive/Alvacast/SISTRAT 2023/renv/library/windows/R-4.4/x86_64-w64-mingw32

Code
message(paste0("Date: ",withr::with_locale(new = c('LC_TIME' = 'C'), code =Sys.time())))

Date: 2025-09-25 20:11:04.6968

Code
message(paste0("Editor context: ", path))

Error: objeto ‘path’ no encontrado

Code
cat("quarto version: "); quarto::quarto_version()
quarto version: 
[1] '1.7.29'
Code
sesion_info <- devtools::session_info()

Warning in system2(“quarto”, “-V”, stdout = TRUE, env = paste0(“TMPDIR=”, : el comando ejecutado ‘“quarto” TMPDIR=C:/Users/andre/AppData/Local/Temp/RtmpOWXeQI/file1d8c7aca2493 -V’ tiene el estatus 1

Code
# Interactive table of attached R packages (name, loaded version, source),
# compact styling applied via the DataTables initComplete JS callback
dplyr::select(
  tibble::as_tibble(sesion_info$packages),
  c(package, loadedversion, source)
)|> 
  DT::datatable(filter = 'top', colnames = c('Row number' =1,'Package' = 2, 'Version'= 3),
              caption = htmltools::tags$caption(
        style = 'caption-side: top; text-align: left;',
        '', htmltools::em('R packages')),
      options=list(
initComplete = htmlwidgets::JS(
        "function(settings, json) {",
        "$(this.api().tables().body()).css({
            'font-family': 'Helvetica Neue',
            'font-size': '70%', 
            'code-inline-font-size': '15%', 
            'white-space': 'nowrap',
            'line-height': '0.75em',
            'min-height': '0.5em'
            });",
        "}")))
Code
#|echo: true
#|error: true
#|message: true
#|paged.print: true
#|class-output: center-table

# Interactive table of Python packages in the reticulate environment
# (same compact DataTables styling as the R-packages table above)
reticulate::py_list_packages()|> 
  DT::datatable(filter = 'top', colnames = c('Row number' =1,'Package' = 2, 'Version'= 3),
              caption = htmltools::tags$caption(
        style = 'caption-side: top; text-align: left;',
        '', htmltools::em('Python packages')),
      options=list(
initComplete = htmlwidgets::JS(
        "function(settings, json) {",
        "$(this.api().tables().body()).css({
            'font-family': 'Helvetica Neue',
            'font-size': '70%', 
            'code-inline-font-size': '15%', 
            'white-space': 'nowrap',
            'line-height': '0.75em',
            'min-height': '0.5em'
            });",
        "}"))) 

Warning in system2(python, args, stdout = TRUE): el comando ejecutado ‘“G:/My Drive/Alvacast/SISTRAT 2023/.mamba_root/envs/py311/python.exe” -m pip freeze’ tiene el estatus 1

Save

Code
# Project root: remove every "cons" component from getwd()'s path.
# The previous version wrapped this in a redundant outer paste0() and a
# no-op gsub("/cons", ...) (nothing matched after the inner gsub); both
# removed. The resulting value is byte-identical, including the trailing
# slash that the paste0() calls below rely on.
wdpath <- gsub("cons", "", paste0(getwd(), "/cons"))
# Cloud-drive root differs by machine (Google Drive mounted on G: vs E:)
envpath <- if (regmatches(wdpath, regexpr("[A-Za-z]+", wdpath)) == "G") {
  "G:/Mi unidad/Alvacast/SISTRAT 2023/"
} else {
  "E:/Mi unidad/Alvacast/SISTRAT 2023/"
}

# Echo the candidate paths for visual verification in the rendered output
paste0(getwd(), "/cons")
file.path(paste0(wdpath, "data/20241015_out"))
file.path(paste0(envpath, "data/20241015_out"))

# Save the full workspace, date-stamped
rdata_path <- file.path(wdpath, "data/20241015_out/psu",
                        paste0("psu_ndp_", format(Sys.time(), "%Y_%m_%d"), ".Rdata"))

save.image(rdata_path)
# Trailing newline so the next output line does not run into this one
cat("Saved in:", rdata_path, "\n")

#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
time_after_dedup2 <- Sys.time()

# cat() instead of a bare paste0() so the label prints without "[1]"/quotes
cat("Time in markdown: "); time_after_dedup2 - time_before_dedup2
[1] "G:/My Drive/Alvacast/SISTRAT 2023/cons/cons"
[1] "G:/My Drive/Alvacast/SISTRAT 2023//data/20241015_out"
[1] "G:/Mi unidad/Alvacast/SISTRAT 2023/data/20241015_out"
Saved in: G:/My Drive/Alvacast/SISTRAT 2023///data/20241015_out/psu/psu_ndp_2025_09_25.Rdata[1] "Time in markdown: "
Time difference of 6.39316 mins
Back to top